Update README with some explanations #700

Merged 9 commits on Oct 8, 2023
README.md: 98 changes (56 additions, 42 deletions)
@@ -396,15 +396,15 @@ See [examples](examples) for quick start. It is recommended to duplicate and modify
<summary>All yaml options</summary>

```yaml
# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files
# This can also be a relative path to a model on disk
base_model: ./llama-7b-hf
# You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
base_model_ignore_patterns:
# If the base_model repo on hf hub doesn't include configuration .json files,
# you can set that here, or leave this empty to default to base_model
base_model_config: ./llama-7b-hf
# You can choose a specific model revision from huggingface hub
model_revision:
# Optional tokenizer configuration override in case you want to use a different tokenizer
# than the one defined in the base model
@@ -419,23 +419,24 @@ trust_remote_code:
tokenizer_use_fast:
# Whether to use the legacy tokenizer setting, defaults to True
tokenizer_legacy:
# Resize the model embeddings when new tokens are added to multiples of 32
# This is reported to improve training speed on some models
resize_token_embeddings_to_32x:

# Used to identify what the model is based on
is_falcon_derived_model:
is_llama_derived_model:
# Please note that if you set this to true, `padding_side` will be set to "left" by default
is_mistral_derived_model:

# Whether you are training a 4-bit GPTQ quantized model
gptq: true
gptq_groupsize: 128 # group size
gptq_model_v1: false # v1 or v2

# This will attempt to quantize the model down to 8 bits and use an 8-bit Adam optimizer
load_in_8bit: true
# Use bitsandbytes 4 bit
load_in_4bit:
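# Example (illustrative, not from this README): for 4-bit QLoRA-style training you would
# typically pair load_in_4bit: true with adapter: qlora (see the adapter section below),
# while load_in_8bit: true is commonly paired with adapter: lora.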

# Use CUDA bf16
@@ -449,9 +450,9 @@ tf32: true # require >=ampere
bfloat16: true # require >=ampere
float16: true

# A list of one or more datasets to finetune the model with
datasets:
# HuggingFace dataset repo | "json" for local dataset, make sure to fill data_files
- path: vicgalle/alpaca-gpt4
# The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
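# Example (illustrative sketch, hypothetical filename): a local dataset entry would look roughly like
#   - path: json
#     data_files: my_data.jsonl
#     type: alpaca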
@@ -485,7 +486,7 @@ datasets:
# axolotl attempts to save the dataset as an arrow after packing the data together so
# subsequent training attempts load faster, relative path
dataset_prepared_path: data/last_run_prepared
# Push prepared dataset to hub
push_dataset_to_hub: # repo path
# The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
# if not set.
@@ -495,8 +496,8 @@ hub_model_id: # repo path to push finetuned model
# how to push checkpoints to hub
# https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy
hub_strategy:
# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
# Required to be true when used in combination with `push_dataset_to_hub`
hf_use_auth_token: # boolean
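# Example (illustrative, hypothetical repo path): pushing the prepared dataset to the hub
# would look roughly like
#   push_dataset_to_hub: your-org/your-dataset
#   hf_use_auth_token: true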
# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.
val_set_size: 0.04
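# e.g., with a 10,000-row dataset, val_set_size: 0.04 holds out roughly 400 rows for evaluation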
@@ -505,17 +506,17 @@ dataset_shard_num:
# Index of shard to use for whole dataset
dataset_shard_idx:

# The maximum length of an input to train with, this should typically be less than 2048
# as most models have a token/context limit of 2048
sequence_len: 2048
# Pad inputs so each step uses constant sized buffers
# this will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
pad_to_sequence_len:
# Max sequence length to concatenate training samples together up to
# inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
# FutureWarning: This will soon be DEPRECATED
max_packed_sequence_len: 1024
# Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommended to set to 'true'
sample_packing:
# set to 'false' if getting errors during eval with sample_packing on.
eval_sample_packing:
@@ -524,10 +525,11 @@ eval_sample_packing:
sample_packing_eff_est:
total_num_tokens:
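# Example (illustrative): a common packing setup is sequence_len: 2048 with
# sample_packing: true and pad_to_sequence_len: true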

# Use 'lora' or 'qlora', or leave blank to train all parameters in the original model
adapter: lora
# If you already have a lora model trained that you want to load, put that here.
# This means after training, if you want to test the model, you should set this to the value of `lora_out_dir`.
lora_model_dir:
# LoRA hyperparameters
lora_r: 8
lora_alpha: 16
@@ -544,32 +546,39 @@ lora_target_linear: # if true, will target all linear layers
lora_modules_to_save:
# - embed_tokens
# - lm_head

# Once you complete training, the model will be saved to the following directory.
# If you merge the adapter to the base model, a subdirectory `merged` will be created under this directory.
# Make sure `lora_model_dir` points to this directory if you want to use the trained model.
lora_out_dir:
lora_fan_in_fan_out: false
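# Example workflow (illustrative, hypothetical path):
#   train with adapter: lora and lora_out_dir: ./lora-out
#   then, to test or reuse the trained adapter, set lora_model_dir: ./lora-out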

# ReLoRA configuration
# Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
relora_steps: # Number of steps per ReLoRA restart
relora_warmup_steps: # Number of per-restart warmup steps
relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings

# wandb configuration if you're using it
wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
wandb_project: # Your wandb project name
wandb_entity: # A wandb Team name if using a Team
wandb_watch:
wandb_run_id: # Set the name of your wandb run
wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training

# Where to save the full-finetuned model to
output_dir: ./completed-model

# Whether to use torch.compile and which backend to use
torch_compile: # bool
torch_compile_backend: # Optional[str]
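# Example (illustrative): torch_compile: true with torch_compile_backend: inductor
# (inductor is the default torch.compile backend in PyTorch 2.x)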

# Training hyperparameters

# The number of steps to accumulate gradients over; the optimizer update is skipped for the
# intermediate steps and the accumulated gradients are applied together in a later step.
gradient_accumulation_steps: 1
# The number of samples per GPU to process in each forward pass.
# micro_batch_size * gradient_accumulation_steps * number of GPUs = total batch size
micro_batch_size: 2
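# Worked example: micro_batch_size: 2 with gradient_accumulation_steps: 1 on 4 GPUs
# gives a total batch size of 2 * 1 * 4 = 8 samples per optimizer step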
eval_batch_size:
num_epochs: 3
@@ -581,6 +590,9 @@ save_strategy: # set to `no` to skip checkpoint saves
save_steps: # leave empty to save at each epoch
eval_steps: # leave empty to eval at each epoch
save_total_limit: # checkpoints saved at a time
# Maximum number of iterations to train for. It takes precedence over num_epochs, which means that
# if both are set, num_epochs is not guaranteed to be completed.
# e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for only 100 steps
max_steps:

eval_table_size: # approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
@@ -589,7 +601,7 @@ eval_table_max_new_tokens: # total number of tokens generated for predictions sent to wandb
# save model as safetensors (require safetensors package)
save_safetensors:

# Whether to mask out or include the human's prompt from the training labels
train_on_inputs: false
# group similarly sized data to minimize padding
# may be slower to start, as it must download and sort the entire dataset
@@ -599,18 +611,18 @@ group_by_length: false
# Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
gradient_checkpointing: false

# Stop training after this many evaluation losses have increased in a row
# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
early_stopping_patience: 3

# Specify a scheduler and kwargs to use with the optimizer
lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
lr_scheduler_kwargs:

# For one_cycle optim
lr_div_factor: # learning rate div factor

# For log_sweep optim
log_sweep_min_lr:
log_sweep_max_lr:

@@ -679,19 +691,21 @@ auto_resume_from_checkpoints: false
# don't mess with this, it's here for accelerate and torchrun
local_rank:

# Add or change special tokens.
# If you add tokens here, you don't need to add them to the `tokens` list.
special_tokens:
# bos_token: "<s>"
# eos_token: "</s>"
# unk_token: "<unk>"

# Add extra tokens.
tokens:
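#  - "<my_new_token>"  # hypothetical example token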

# FSDP
fsdp:
fsdp_config:

# Deepspeed config path. e.g., deepspeed/zero3.json
deepspeed:

# Advanced DDP Arguments