From 2c8b4367019609ba3dadef84aa3ef70482d7f7b6 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Mon, 18 Sep 2023 22:43:12 -0400 Subject: [PATCH 1/3] more sane defaults for openllama 3b used for quickstarts --- examples/openllama-3b/config.yml | 18 +++++++++--------- examples/openllama-3b/lora.yml | 20 ++++++++++---------- examples/openllama-3b/qlora.yml | 24 ++++++++++++------------ 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/examples/openllama-3b/config.yml b/examples/openllama-3b/config.yml index 0d8144d6b..16abafb15 100644 --- a/examples/openllama-3b/config.yml +++ b/examples/openllama-3b/config.yml @@ -13,8 +13,8 @@ dataset_prepared_path: last_run_prepared val_set_size: 0.02 adapter: lora_model_dir: -sequence_len: 256 -max_packed_sequence_len: +sequence_len: 1024 +sample_packing: true lora_r: lora_alpha: lora_dropout: @@ -29,28 +29,28 @@ wandb_log_model: output_dir: ./openllama-out gradient_accumulation_steps: 1 micro_batch_size: 1 -num_epochs: 3 +num_epochs: 4 optimizer: adamw_bnb_8bit torchdistx_path: lr_scheduler: cosine -learning_rate: 0.00001 +learning_rate: 0.000003 train_on_inputs: false group_by_length: false float16: true bf16: false -fp16: false +fp16: true tf32: false gradient_checkpointing: true early_stopping_patience: resume_from_checkpoint: local_rank: logging_steps: 1 -xformers_attention: true -flash_attention: +xformers_attention: +flash_attention: true gptq_groupsize: gptq_model_v1: -warmup_steps: 10 -eval_steps: 50 +warmup_steps: 20 +eval_steps: 0.05 save_steps: debug: deepspeed: diff --git a/examples/openllama-3b/lora.yml b/examples/openllama-3b/lora.yml index acf0826c9..9373233d6 100644 --- a/examples/openllama-3b/lora.yml +++ b/examples/openllama-3b/lora.yml @@ -13,8 +13,8 @@ dataset_prepared_path: last_run_prepared val_set_size: 0.02 adapter: lora lora_model_dir: -sequence_len: 256 -max_packed_sequence_len: +sequence_len: 1024 +sample_packing: true lora_r: 8 lora_alpha: 16 lora_dropout: 0.0 @@ -33,9 +33,9 @@ wandb_watch: wandb_run_id: wandb_log_model: output_dir: ./lora-out -batch_size: 16 -micro_batch_size: 4 -num_epochs: 3 +gradient_accumulation_steps: 1 +micro_batch_size: 2 +num_epochs: 4 optimizer: adamw_bnb_8bit torchdistx_path: lr_scheduler: cosine @@ -50,16 +50,16 @@ early_stopping_patience: resume_from_checkpoint: local_rank: logging_steps: 1 -xformers_attention: true -flash_attention: +xformers_attention: +flash_attention: true gptq_groupsize: gptq_model_v1: -warmup_steps: 10 -eval_steps: 50 +warmup_steps: 20 +eval_steps: 0.05 save_steps: debug: deepspeed: -weight_decay: 0.0 +weight_decay: 0.1 fsdp: fsdp_config: special_tokens: diff --git a/examples/openllama-3b/qlora.yml b/examples/openllama-3b/qlora.yml index d8c43df82..ffb1a668f 100644 --- a/examples/openllama-3b/qlora.yml +++ b/examples/openllama-3b/qlora.yml @@ -13,8 +13,8 @@ dataset_prepared_path: last_run_prepared val_set_size: 0.01 adapter: qlora lora_model_dir: -sequence_len: 2048 -max_packed_sequence_len: 2048 +sequence_len: 1024 +sample_packing: true lora_r: 8 lora_alpha: 32 lora_dropout: 0.05 @@ -27,8 +27,8 @@ wandb_watch: wandb_run_id: wandb_log_model: output_dir: ./qlora-out -batch_size: 4 -micro_batch_size: 4 +gradient_accumulation_steps: 1 +micro_batch_size: 2 num_epochs: 2 optimizer: paged_adamw_32bit torchdistx_path: @@ -36,24 +36,24 @@ lr_scheduler: cosine learning_rate: 0.0002 train_on_inputs: false group_by_length: false -bf16: true -fp16: false -tf32: true +bf16: false +fp16: true +tf32: false gradient_checkpointing: true early_stopping_patience: resume_from_checkpoint: local_rank: logging_steps: 1 -xformers_attention: true -flash_attention: +xformers_attention: +flash_attention: true gptq_groupsize: gptq_model_v1: -warmup_steps: 10 -eval_steps: 20 +warmup_steps: 20 +eval_steps: 0.05 save_steps: debug: deepspeed: -weight_decay: 0.0 +weight_decay: 0.1 fsdp: fsdp_config: special_tokens: From 5b41a8be7916d808d6e687be15ea11c741c45c58 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Mon, 18 Sep 2023 22:54:07 -0400 Subject: [PATCH 2/3] don't use bf16 for quickstart to simplify gpu compatibility --- examples/openllama-3b/config.yml | 2 +- examples/openllama-3b/qlora.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/openllama-3b/config.yml b/examples/openllama-3b/config.yml index 16abafb15..9de748676 100644 --- a/examples/openllama-3b/config.yml +++ b/examples/openllama-3b/config.yml @@ -38,7 +38,7 @@ train_on_inputs: false group_by_length: false float16: true bf16: false -fp16: true +fp16: false tf32: false gradient_checkpointing: true early_stopping_patience: diff --git a/examples/openllama-3b/qlora.yml b/examples/openllama-3b/qlora.yml index ffb1a668f..df8e49cb8 100644 --- a/examples/openllama-3b/qlora.yml +++ b/examples/openllama-3b/qlora.yml @@ -29,7 +29,7 @@ wandb_log_model: output_dir: ./qlora-out gradient_accumulation_steps: 1 micro_batch_size: 2 -num_epochs: 2 +num_epochs: 4 optimizer: paged_adamw_32bit torchdistx_path: lr_scheduler: cosine From b5863d645fa2f4c591e339aad22fd19cbf1d3421 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Tue, 19 Sep 2023 00:39:59 -0400 Subject: [PATCH 3/3] use the update openlm-research/open_llama_3b_v2 models --- examples/openllama-3b/config.yml | 4 ++-- examples/openllama-3b/lora.yml | 4 ++-- examples/openllama-3b/qlora.yml | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/openllama-3b/config.yml b/examples/openllama-3b/config.yml index 9de748676..961aeabda 100644 --- a/examples/openllama-3b/config.yml +++ b/examples/openllama-3b/config.yml @@ -1,5 +1,5 @@ -base_model: openlm-research/open_llama_3b -base_model_config: openlm-research/open_llama_3b +base_model: openlm-research/open_llama_3b_v2 +base_model_config: openlm-research/open_llama_3b_v2 model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer load_in_8bit: false diff --git a/examples/openllama-3b/lora.yml b/examples/openllama-3b/lora.yml index 9373233d6..17fa7fa8b 100644 --- a/examples/openllama-3b/lora.yml +++ b/examples/openllama-3b/lora.yml @@ -1,5 +1,5 @@ -base_model: openlm-research/open_llama_3b -base_model_config: openlm-research/open_llama_3b +base_model: openlm-research/open_llama_3b_v2 +base_model_config: openlm-research/open_llama_3b_v2 model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer load_in_8bit: true diff --git a/examples/openllama-3b/qlora.yml b/examples/openllama-3b/qlora.yml index df8e49cb8..deba03fd5 100644 --- a/examples/openllama-3b/qlora.yml +++ b/examples/openllama-3b/qlora.yml @@ -1,5 +1,5 @@ -base_model: openlm-research/open_llama_3b -base_model_config: openlm-research/open_llama_3b +base_model: openlm-research/open_llama_3b_v2 +base_model_config: openlm-research/open_llama_3b_v2 model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer load_in_8bit: false