diff --git a/README.md b/README.md index 593eff6c3..60013df93 100644 --- a/README.md +++ b/README.md @@ -487,6 +487,9 @@ datasets: dataset_prepared_path: data/last_run_prepared # push prepared dataset to hub push_dataset_to_hub: # repo path +# The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()` +# if not set. +dataset_processes: # defaults to os.cpu_count() if not set # push checkpoints to hub hub_model_id: # repo path to push finetuned model # how to push checkpoints to hub diff --git a/src/axolotl/utils/config.py b/src/axolotl/utils/config.py index 5a034ea0f..a8c41d95b 100644 --- a/src/axolotl/utils/config.py +++ b/src/axolotl/utils/config.py @@ -75,6 +75,8 @@ def normalize_config(cfg): else: cfg.torch_dtype = torch.float32 + cfg.dataset_processes = cfg.dataset_processes or os.cpu_count() + model_config = load_model_config(cfg) cfg.model_config_type = model_config.model_type diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 3c75e4ec5..80ee5c8c6 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -400,19 +400,25 @@ def disable_datasets_caching(): def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer): drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len) with zero_first(is_main_process()): - train_dataset = train_dataset.filter(drop_long, num_proc=os.cpu_count()) + train_dataset = train_dataset.filter(drop_long, num_proc=cfg.dataset_processes) if eval_dataset: - eval_dataset = eval_dataset.filter(drop_long, num_proc=os.cpu_count()) + eval_dataset = eval_dataset.filter( + drop_long, num_proc=cfg.dataset_processes + ) if cfg.group_by_length: - train_dataset = train_dataset.map(add_length, num_proc=os.cpu_count()) + train_dataset = train_dataset.map( + add_length, num_proc=cfg.dataset_processes + ) if cfg.sample_packing: - train_dataset = train_dataset.map(add_position_ids, num_proc=os.cpu_count()) + train_dataset = train_dataset.map( + add_position_ids, num_proc=cfg.dataset_processes + ) if cfg.eval_sample_packing is not False: if eval_dataset: eval_dataset = eval_dataset.map( - add_position_ids, num_proc=os.cpu_count() + add_position_ids, num_proc=cfg.dataset_processes ) # Phi doesn't want the attention_mask feature when training