fix: HuggingFace Trust Remote Repo (#9535)

* set trust_remote_code=True in HuggingFace examples
determined-ai · Jun 18, 2024 · 44f446c
1 parent 3320107
commit 44f446c
Show file tree

Hide file tree

Showing 12 changed files with 19 additions and 11 deletions.
diff --git a/examples/hf_trainer_api/hf_image_classification/adaptive.yaml b/examples/hf_trainer_api/hf_image_classification/adaptive.yaml
@@ -46,5 +46,5 @@ entrypoint: >-
   --seed 1337
   --save_strategy steps
   --save_steps 20
-  --trust_remote_code false
+  --trust_remote_code true
 max_restarts: 0
diff --git a/examples/hf_trainer_api/hf_image_classification/const.yaml b/examples/hf_trainer_api/hf_image_classification/const.yaml
@@ -34,5 +34,5 @@ entrypoint: >-
   --seed 1337
   --save_strategy steps
   --save_steps 20
-  --trust_remote_code false
+  --trust_remote_code true
 max_restarts: 0
diff --git a/examples/hf_trainer_api/hf_image_classification/const_epochs.yaml b/examples/hf_trainer_api/hf_image_classification/const_epochs.yaml
@@ -34,5 +34,5 @@ entrypoint: >-
   --save_total_limit 3
   --seed 1337
   --save_strategy epoch
-  --trust_remote_code false
+  --trust_remote_code true
 max_restarts: 0
diff --git a/examples/hf_trainer_api/hf_image_classification/deepspeed.yaml b/examples/hf_trainer_api/hf_image_classification/deepspeed.yaml
@@ -38,6 +38,6 @@ entrypoint: >-
   --save_strategy steps
   --save_steps 20
   --deepspeed ds_configs/ds_config_stage_1.json
-  --trust_remote_code false
+  --trust_remote_code true
   --fp16
 max_restarts: 0
diff --git a/examples/hf_trainer_api/hf_image_classification/distributed.yaml b/examples/hf_trainer_api/hf_image_classification/distributed.yaml
@@ -34,5 +34,5 @@ entrypoint: >-
   --seed 1337
   --save_strategy steps
   --save_steps 20
-  --trust_remote_code false
+  --trust_remote_code true
 max_restarts: 0
diff --git a/examples/hf_trainer_api/hf_image_classification/image_classification.py b/examples/hf_trainer_api/hf_image_classification/image_classification.py
@@ -278,6 +278,7 @@ def main(det_callback, tb_callback, model_args, data_args, training_args):
             data_args.dataset_config_name,
             cache_dir=model_args.cache_dir,
             use_auth_token=True if model_args.use_auth_token else None,
+            trust_remote_code=True,
         )
     else:
         data_files = {}
@@ -290,6 +291,7 @@ def main(det_callback, tb_callback, model_args, data_args, training_args):
             data_files=data_files,
             cache_dir=model_args.cache_dir,
             task="image-classification",
+            trust_remote_code=True,
         )
 
     # If we don't have a validation split, split off a percentage of train as validation.
@@ -310,7 +312,7 @@ def main(det_callback, tb_callback, model_args, data_args, training_args):
         id2label[str(i)] = label
 
     # Load the accuracy metric from the datasets package
-    metric = datasets.load_metric("accuracy")
+    metric = datasets.load_metric("accuracy", trust_remote_code=True,)
 
     # Define our compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
     # predictions and label_ids field) and has to return a dictionary string to float.

diff --git a/examples/hf_trainer_api/hf_language_modeling/adaptive.yaml b/examples/hf_trainer_api/hf_language_modeling/adaptive.yaml
@@ -47,5 +47,5 @@ entrypoint: >-
   --save_steps 20
   --per_device_train_batch_size 8
   --per_device_eval_batch_size 8
-  --trust_remote_code false
+  --trust_remote_code true
 max_restarts: 0
diff --git a/examples/hf_trainer_api/hf_language_modeling/const.yaml b/examples/hf_trainer_api/hf_language_modeling/const.yaml
@@ -35,5 +35,5 @@ entrypoint: >-
   --save_steps 20
   --per_device_train_batch_size 8
   --per_device_eval_batch_size 8
-  --trust_remote_code false
+  --trust_remote_code true
 max_restarts: 0
diff --git a/examples/hf_trainer_api/hf_language_modeling/const_epochs.yaml b/examples/hf_trainer_api/hf_language_modeling/const_epochs.yaml
@@ -36,5 +36,5 @@ entrypoint: >-
   --save_steps 20
   --per_device_train_batch_size 8
   --per_device_eval_batch_size 8
-  --trust_remote_code false
+  --trust_remote_code true
 max_restarts: 0
diff --git a/examples/hf_trainer_api/hf_language_modeling/deepspeed.yaml b/examples/hf_trainer_api/hf_language_modeling/deepspeed.yaml
@@ -39,6 +39,6 @@ entrypoint: >-
   --deepspeed ds_configs/ds_config_stage_1.json
   --per_device_train_batch_size 8
   --per_device_eval_batch_size 8
-  --trust_remote_code false
+  --trust_remote_code true
   --fp16
 max_restarts: 0
diff --git a/examples/hf_trainer_api/hf_language_modeling/distributed.yaml b/examples/hf_trainer_api/hf_language_modeling/distributed.yaml
@@ -35,5 +35,5 @@ entrypoint: >-
   --save_steps 20
   --per_device_train_batch_size 8
   --per_device_eval_batch_size 8
-  --trust_remote_code false
+  --trust_remote_code true
 max_restarts: 0
diff --git a/examples/hf_trainer_api/hf_language_modeling/run_clm.py b/examples/hf_trainer_api/hf_language_modeling/run_clm.py
@@ -367,6 +367,7 @@ def main(det_callback, tb_callback, model_args, data_args, training_args):
             cache_dir=model_args.cache_dir,
             use_auth_token=True if model_args.use_auth_token else None,
             streaming=data_args.streaming,
+            trust_remote_code=True,
         )
         if "validation" not in raw_datasets.keys():
             raw_datasets["validation"] = load_dataset(
@@ -376,6 +377,7 @@ def main(det_callback, tb_callback, model_args, data_args, training_args):
                 cache_dir=model_args.cache_dir,
                 use_auth_token=True if model_args.use_auth_token else None,
                 streaming=data_args.streaming,
+                trust_remote_code=True,
             )
             raw_datasets["train"] = load_dataset(
                 data_args.dataset_name,
@@ -384,6 +386,7 @@ def main(det_callback, tb_callback, model_args, data_args, training_args):
                 cache_dir=model_args.cache_dir,
                 use_auth_token=True if model_args.use_auth_token else None,
                 streaming=data_args.streaming,
+                trust_remote_code=True,
             )
     else:
         data_files = {}
@@ -406,6 +409,7 @@ def main(det_callback, tb_callback, model_args, data_args, training_args):
             cache_dir=model_args.cache_dir,
             use_auth_token=True if model_args.use_auth_token else None,
             **dataset_args,
+            trust_remote_code=True,
         )
         # If no validation data is there, validation_split_percentage will be used to divide the dataset.
         if "validation" not in raw_datasets.keys():
@@ -416,6 +420,7 @@ def main(det_callback, tb_callback, model_args, data_args, training_args):
                 cache_dir=model_args.cache_dir,
                 use_auth_token=True if model_args.use_auth_token else None,
                 **dataset_args,
+                trust_remote_code=True,
             )
             raw_datasets["train"] = load_dataset(
                 extension,
@@ -424,6 +429,7 @@ def main(det_callback, tb_callback, model_args, data_args, training_args):
                 cache_dir=model_args.cache_dir,
                 use_auth_token=True if model_args.use_auth_token else None,
                 **dataset_args,
+                trust_remote_code=True,
             )
 
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at