mosaicml · irenedea · Apr 10, 2024 · Apr 10, 2024
@@ -114,6 +114,13 @@ def parse_args() -> Namespace:
         help='If true, reprocess the input_folder to mds format. Otherwise, ' +
         'only reprocess upon changes to the input folder or dataset creation parameters.',
     )
+    parser.add_argument(
+        '--trust-remote-code',
+        type=bool,
+        required=False,
+        default=False,
+        help='If true, allows custom code to be executed to load the tokenizer',
+    )
 
     parsed = parser.parse_args()
 
@@ -124,7 +131,8 @@ def parse_args() -> Namespace:
             parser.error(
                 'Cannot set --eos_text with --use_tokenizer_eos. Please specify one.'
             )
-        tokenizer = AutoTokenizer.from_pretrained(parsed.tokenizer)
+        tokenizer = AutoTokenizer.from_pretrained(
+            parsed.tokenizer, trust_remote_code=parsed.trust_remote_code)
         parsed.eos_text = tokenizer.eos_token
 
     # now that we have validated them, change BOS/EOS to strings
@@ -171,6 +179,7 @@ def get_task_args(
     bos_text: str,
     no_wrap: bool,
     compression: str,
+    trust_remote_code: bool,
 ) -> Iterable:
     """Get download_and_convert arguments split across n_groups.
 
@@ -187,6 +196,7 @@ def get_task_args(
         bos_text (str): Text to prepend to each example to separate concatenated samples
         no_wrap: (bool): Whether to let text examples wrap across multiple training examples
         compression (str): The compression algorithm to use for MDS writing
+        trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer
     """
     num_objects = len(object_names)
     objs_per_group = math.ceil(num_objects / n_groups)
@@ -202,6 +212,7 @@ def get_task_args(
             bos_text,
             no_wrap,
             compression,
+            trust_remote_code,
         )
 
 
@@ -223,6 +234,7 @@ def download_and_convert(
     bos_text: str,
     no_wrap: bool,
     compression: str,
+    trust_remote_code: bool,
 ):
     """Downloads and converts text fies to MDS format.
 
@@ -236,6 +248,7 @@ def download_and_convert(
         bos_text (str): Text to prepend to each example to separate concatenated samples
         no_wrap: (bool): Whether to let text examples wrap across multiple training examples
         compression (str): The compression algorithm to use for MDS writing
+        trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer
     """
     object_store = maybe_create_object_store_from_uri(input_folder)
 
@@ -244,7 +257,8 @@ def download_and_convert(
         downloading_iter = DownloadingIterable(object_names=file_names,
                                                output_folder=tmp_dir,
                                                object_store=object_store)
-        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+        tokenizer = AutoTokenizer.from_pretrained(
+            tokenizer_name, trust_remote_code=trust_remote_code)
         tokenizer.model_max_length = 5000000000  # Hack to prevent warnings from HuggingFace
 
         # Use the ConcatTokensDataset from LLM-foundry to concatenate sequences of tokens up
@@ -353,6 +367,7 @@ def convert_text_to_mds(
     processes: int,
     args_str: str,
     reprocess: bool,
+    trust_remote_code: bool,
 ):
     """Convert a folder of text files to MDS format.
 
@@ -368,6 +383,7 @@ def convert_text_to_mds(
         processes (int): The number of processes to use.
         args_str (str): String representation of the arguments
         reprocess (bool): Whether to always reprocess the given folder of text files
+        trust_remote_code (bool): If true, allows custom code to be executed to load the tokenizer
     """
     is_remote_output = is_remote_path(output_folder)
 
@@ -396,7 +412,7 @@ def convert_text_to_mds(
         # Download and convert the text files in parallel
         args = get_task_args(object_names, local_output_folder, input_folder,
                              processes, tokenizer_name, concat_tokens, eos_text,
-                             bos_text, no_wrap, compression)
+                             bos_text, no_wrap, compression, trust_remote_code)
         with ProcessPoolExecutor(max_workers=processes) as executor:
             list(executor.map(download_and_convert_starargs, args))
 
@@ -405,7 +421,7 @@ def convert_text_to_mds(
     else:
         download_and_convert(object_names, local_output_folder, input_folder,
                              tokenizer_name, concat_tokens, eos_text, bos_text,
-                             no_wrap, compression)
+                             no_wrap, compression, trust_remote_code)
 
     # Write a done file with the args and object names
     write_done_file(local_output_folder, args_str, object_names)
@@ -462,6 +478,7 @@ def _args_str(original_args: Namespace) -> str:
                             compression=args.compression,
                             processes=args.processes,
                             reprocess=args.reprocess,
+                            trust_remote_code=args.trust_remote_code,
                             args_str=_args_str(args))
     except Exception as e:
         if mosaicml_logger is not None:

diff --git a/tests/a_scripts/data_prep/test_convert_text_to_mds.py b/tests/a_scripts/data_prep/test_convert_text_to_mds.py
@@ -106,6 +106,7 @@ def call_convert_text_to_mds() -> None:
             processes=processes,
             args_str='Namespace()',
             reprocess=False,
+            trust_remote_code=False,
         )
 
     call_convert_text_to_mds()
@@ -195,6 +196,7 @@ def call_convert_text_to_mds(reprocess: bool):
             processes=1,
             args_str='Namespace()',
             reprocess=reprocess,
+            trust_remote_code=False,
         )
 
     # Create input text data
@@ -234,6 +236,7 @@ def test_input_folder_not_exist(tmp_path: pathlib.Path):
             processes=1,
             args_str='Namespace()',
             reprocess=False,
+            trust_remote_code=False,
         )