
simplify by removing duplicate base_model_config #772

Merged · 1 commit · Oct 23, 2023
1 change: 0 additions & 1 deletion examples/cerebras/btlm-ft.yml
@@ -1,5 +1,4 @@
 base_model: cerebras/btlm-3b-8k-base
-base_model_config: cerebras/btlm-3b-8k-base
 model_type: AutoModelForCausalLM
 tokenizer_type: GPT2Tokenizer
 trust_remote_code: true

1 change: 0 additions & 1 deletion examples/cerebras/qlora.yml
@@ -1,5 +1,4 @@
 base_model: cerebras/Cerebras-GPT-1.3B
-base_model_config: cerebras/Cerebras-GPT-1.3B
 load_in_8bit: false
 load_in_4bit: true
 strict: false

1 change: 0 additions & 1 deletion examples/code-llama/13b/lora.yml
@@ -1,5 +1,4 @@
 base_model: codellama/CodeLlama-13b-hf
-base_model_config: codellama/CodeLlama-13b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: CodeLlamaTokenizer
 is_llama_derived_model: true

1 change: 0 additions & 1 deletion examples/code-llama/13b/qlora.yml
@@ -1,5 +1,4 @@
 base_model: codellama/CodeLlama-13b-hf
-base_model_config: codellama/CodeLlama-13b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: CodeLlamaTokenizer
 is_llama_derived_model: true

1 change: 0 additions & 1 deletion examples/code-llama/34b/lora.yml
@@ -1,5 +1,4 @@
 base_model: codellama/CodeLlama-34b-hf
-base_model_config: codellama/CodeLlama-34b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: CodeLlamaTokenizer
 is_llama_derived_model: true

1 change: 0 additions & 1 deletion examples/code-llama/34b/qlora.yml
@@ -1,5 +1,4 @@
 base_model: codellama/CodeLlama-34b-hf
-base_model_config: codellama/CodeLlama-34b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: CodeLlamaTokenizer
 is_llama_derived_model: true

1 change: 0 additions & 1 deletion examples/code-llama/7b/lora.yml
@@ -1,5 +1,4 @@
 base_model: codellama/CodeLlama-7b-hf
-base_model_config: codellama/CodeLlama-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: CodeLlamaTokenizer
 is_llama_derived_model: true

1 change: 0 additions & 1 deletion examples/code-llama/7b/qlora.yml
@@ -1,5 +1,4 @@
 base_model: codellama/CodeLlama-7b-hf
-base_model_config: codellama/CodeLlama-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: CodeLlamaTokenizer
 is_llama_derived_model: true
1 change: 0 additions & 1 deletion examples/falcon/config-7b-lora.yml
@@ -1,5 +1,4 @@
 base_model: tiiuae/falcon-7b
-base_model_config: tiiuae/falcon-7b
 trust_remote_code: true
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer

1 change: 0 additions & 1 deletion examples/falcon/config-7b-qlora.yml
@@ -1,7 +1,6 @@
 # 1b: tiiuae/falcon-rw-1b
 # 40b: tiiuae/falcon-40b
 base_model: tiiuae/falcon-7b
-base_model_config: tiiuae/falcon-7b
 # required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
 trust_remote_code: true
 model_type: AutoModelForCausalLM

1 change: 0 additions & 1 deletion examples/falcon/config-7b.yml
@@ -1,5 +1,4 @@
 base_model: tiiuae/falcon-7b
-base_model_config: tiiuae/falcon-7b
 trust_remote_code: true
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
1 change: 0 additions & 1 deletion examples/gptj/qlora.yml
@@ -1,5 +1,4 @@
 base_model: EleutherAI/gpt-j-6b
-base_model_config: EleutherAI/gpt-j-6b
 load_in_8bit: false
 load_in_4bit: true
 strict: false

1 change: 0 additions & 1 deletion examples/jeopardy-bot/config.yml
@@ -1,5 +1,4 @@
 base_model: huggyllama/llama-7b
-base_model_config: huggyllama/llama-7b
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: false
1 change: 0 additions & 1 deletion examples/llama-2/fft_optimized.yml
@@ -1,5 +1,4 @@
 base_model: NousResearch/Llama-2-7b-hf
-base_model_config: NousResearch/Llama-2-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 is_llama_derived_model: true

1 change: 0 additions & 1 deletion examples/llama-2/gptq-lora.yml
@@ -1,5 +1,4 @@
 base_model: TheBloke/Llama-2-7B-GPTQ
-base_model_config: TheBloke/Llama-2-7B-GPTQ
 is_llama_derived_model: false
 gptq: true
 gptq_disable_exllama: true

1 change: 0 additions & 1 deletion examples/llama-2/lora.yml
@@ -1,5 +1,4 @@
 base_model: NousResearch/Llama-2-7b-hf
-base_model_config: NousResearch/Llama-2-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 is_llama_derived_model: true

1 change: 0 additions & 1 deletion examples/llama-2/qlora.yml
@@ -1,5 +1,4 @@
 base_model: NousResearch/Llama-2-7b-hf
-base_model_config: NousResearch/Llama-2-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 is_llama_derived_model: true

1 change: 0 additions & 1 deletion examples/llama-2/relora.yml
@@ -1,5 +1,4 @@
 base_model: NousResearch/Llama-2-7b-hf
-base_model_config: NousResearch/Llama-2-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 is_llama_derived_model: true

1 change: 0 additions & 1 deletion examples/llama-2/tiny-llama.yml
@@ -1,5 +1,4 @@
 base_model: PY007/TinyLlama-1.1B-step-50K-105b
-base_model_config: PY007/TinyLlama-1.1B-step-50K-105b

 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
1 change: 0 additions & 1 deletion examples/mistral/config.yml
@@ -1,5 +1,4 @@
 base_model: mistralai/Mistral-7B-v0.1
-base_model_config: mistralai/Mistral-7B-v0.1
 model_type: MistralForCausalLM
 tokenizer_type: LlamaTokenizer
 is_mistral_derived_model: true

1 change: 0 additions & 1 deletion examples/mistral/qlora.yml
@@ -1,5 +1,4 @@
 base_model: mistralai/Mistral-7B-v0.1
-base_model_config: mistralai/Mistral-7B-v0.1
 model_type: MistralForCausalLM
 tokenizer_type: LlamaTokenizer
 is_mistral_derived_model: true

1 change: 0 additions & 1 deletion examples/mpt-7b/config.yml
@@ -1,5 +1,4 @@
 base_model: mosaicml/mpt-7b
-base_model_config: mosaicml/mpt-7b
 tokenizer_type: AutoTokenizer
 trust_remote_code: true # required for mpt as their model class is not merged into transformers yet
 load_in_8bit: false
1 change: 0 additions & 1 deletion examples/openllama-3b/config.yml
@@ -1,5 +1,4 @@
 base_model: openlm-research/open_llama_3b_v2
-base_model_config: openlm-research/open_llama_3b_v2
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: false

1 change: 0 additions & 1 deletion examples/openllama-3b/lora.yml
@@ -1,5 +1,4 @@
 base_model: openlm-research/open_llama_3b_v2
-base_model_config: openlm-research/open_llama_3b_v2
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: true

1 change: 0 additions & 1 deletion examples/openllama-3b/qlora.yml
@@ -1,5 +1,4 @@
 base_model: openlm-research/open_llama_3b_v2
-base_model_config: openlm-research/open_llama_3b_v2
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: false
1 change: 0 additions & 1 deletion examples/phi/phi-ft.yml
@@ -1,5 +1,4 @@
 base_model: microsoft/phi-1_5
-base_model_config: microsoft/phi-1_5
 model_type: MixFormerSequentialForCausalLM
 tokenizer_type: AutoTokenizer
 is_llama_derived_model: false

1 change: 0 additions & 1 deletion examples/phi/phi-qlora.yml
@@ -1,5 +1,4 @@
 base_model: microsoft/phi-1_5
-base_model_config: microsoft/phi-1_5
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 is_llama_derived_model: false

1 change: 0 additions & 1 deletion examples/pythia-12b/config.yml
@@ -1,5 +1,4 @@
 base_model: EleutherAI/pythia-12b-deduped
-base_model_config: EleutherAI/pythia-12b-deduped
 base_model_ignore_patterns: pytorch* # prefer safetensors
 model_type: GPTNeoXForCausalLM
 tokenizer_type: AutoTokenizer
1 change: 0 additions & 1 deletion examples/pythia/lora.yml
@@ -1,5 +1,4 @@
 base_model: EleutherAI/pythia-1.4b-deduped
-base_model_config: EleutherAI/pythia-1.4b-deduped
 load_in_8bit: true
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

1 change: 0 additions & 1 deletion examples/redpajama/config-3b.yml
@@ -1,5 +1,4 @@
 base_model: togethercomputer/RedPajama-INCITE-Chat-3B-v1
-base_model_config: togethercomputer/RedPajama-INCITE-Chat-3B-v1
 model_type: GPTNeoXForCausalLM
 tokenizer_type: AutoTokenizer
 trust_remote_code:

1 change: 0 additions & 1 deletion examples/replit-3b/config-lora.yml
@@ -1,5 +1,4 @@
 base_model: replit/replit-code-v1-3b
-base_model_config: replit/replit-code-v1-3b
 trust_remote_code: true
 load_in_8bit: false
 datasets:

1 change: 0 additions & 1 deletion examples/xgen-7b/xgen-7b-8k-qlora.yml
@@ -1,7 +1,6 @@
 # An example finetuning Salesforce's XGen-7b model with 8k context using QLoRA
 # on Tim Dettmers' Guanaco dataset.
 base_model: Salesforce/xgen-7b-8k-base
-base_model_config: Salesforce/xgen-7b-8k-base
 trust_remote_code: true
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
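All of the example-config edits above follow a single pattern: delete `base_model_config` wherever it merely repeats `base_model`. A before/after sketch of one such config (the model name is illustrative):

```yaml
# before: two keys carrying the same value
base_model: codellama/CodeLlama-7b-hf
base_model_config: codellama/CodeLlama-7b-hf

# after: base_model_config is omitted; normalize_config
# falls back to base_model automatically
base_model: codellama/CodeLlama-7b-hf
```

Configs that point `base_model_config` at a *different* repo than `base_model` are unaffected and keep both keys.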
3 changes: 3 additions & 0 deletions src/axolotl/utils/config.py
@@ -79,6 +79,9 @@ def normalize_config(cfg):
 
     cfg.dataset_processes = cfg.dataset_processes or os.cpu_count()
 
+    if not cfg.base_model_config:
+        cfg.base_model_config = cfg.base_model
+
     model_config = load_model_config(cfg)
     cfg.model_config_type = model_config.model_type
 
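The three added lines are the entire behavioral change: when `base_model_config` is unset, it defaults to `base_model`. A minimal, self-contained sketch of that fallback — here `DictDefault` is a tiny stand-in for axolotl's dict wrapper, and `normalize_base_model_config` is a hypothetical extraction of just these lines from `normalize_config`:

```python
class DictDefault(dict):
    """Tiny stand-in for axolotl's DictDefault: attribute access
    that returns None for missing keys instead of raising."""

    def __getattr__(self, name):
        return self.get(name)

    def __setattr__(self, name, value):
        self[name] = value


def normalize_base_model_config(cfg):
    # If base_model_config was not provided, fall back to base_model
    # so downstream code (e.g. load_model_config) always has a value.
    if not cfg.base_model_config:
        cfg.base_model_config = cfg.base_model
    return cfg


# Omitted key: falls back to base_model.
cfg = normalize_base_model_config(DictDefault({"base_model": "JackFram/llama-68m"}))
print(cfg.base_model_config)  # JackFram/llama-68m

# Explicitly set key: left untouched.
cfg2 = normalize_base_model_config(
    DictDefault(
        {
            "base_model": "TheBloke/Llama-2-7B-GPTQ",
            "base_model_config": "NousResearch/Llama-2-7b-hf",
        }
    )
)
print(cfg2.base_model_config)  # NousResearch/Llama-2-7b-hf
```

This is why every example config that duplicated the value could simply drop the `base_model_config` line, while configs that intentionally diverge keep both keys.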
1 change: 0 additions & 1 deletion tests/e2e/test_fused_llama.py
@@ -31,7 +31,6 @@ def test_fft_packing(self):
         cfg = DictDefault(
             {
                 "base_model": "JackFram/llama-68m",
-                "base_model_config": "JackFram/llama-68m",
                 "flash_attention": True,
                 "flash_attn_fuse_qkv": True,
                 "flash_attn_fuse_mlp": True,
3 changes: 0 additions & 3 deletions tests/e2e/test_lora_llama.py
@@ -29,7 +29,6 @@ def test_lora(self):
         cfg = DictDefault(
             {
                 "base_model": "JackFram/llama-68m",
-                "base_model_config": "JackFram/llama-68m",
                 "tokenizer_type": "LlamaTokenizer",
                 "sequence_len": 1024,
                 "load_in_8bit": True,
@@ -72,7 +71,6 @@ def test_lora_packing(self):
         cfg = DictDefault(
             {
                 "base_model": "JackFram/llama-68m",
-                "base_model_config": "JackFram/llama-68m",
                 "tokenizer_type": "LlamaTokenizer",
                 "sequence_len": 1024,
                 "sample_packing": True,
@@ -117,7 +115,6 @@ def test_lora_gptq(self):
         cfg = DictDefault(
             {
                 "base_model": "TheBlokeAI/jackfram_llama-68m-GPTQ",
-                "base_model_config": "TheBlokeAI/jackfram_llama-68m-GPTQ",
                 "model_type": "AutoModelForCausalLM",
                 "tokenizer_type": "LlamaTokenizer",
                 "sequence_len": 1024,
2 changes: 0 additions & 2 deletions tests/e2e/test_mistral.py
@@ -31,7 +31,6 @@ def test_lora(self):
         cfg = DictDefault(
             {
                 "base_model": "openaccess-ai-collective/tiny-mistral",
-                "base_model_config": "openaccess-ai-collective/tiny-mistral",
                 "flash_attention": True,
                 "sequence_len": 1024,
                 "load_in_8bit": True,
@@ -77,7 +76,6 @@ def test_ft(self):
         cfg = DictDefault(
             {
                 "base_model": "openaccess-ai-collective/tiny-mistral",
-                "base_model_config": "openaccess-ai-collective/tiny-mistral",
                 "flash_attention": True,
                 "sequence_len": 1024,
                 "val_set_size": 0.1,
2 changes: 0 additions & 2 deletions tests/e2e/test_mistral_samplepack.py
@@ -31,7 +31,6 @@ def test_lora_packing(self):
         cfg = DictDefault(
             {
                 "base_model": "openaccess-ai-collective/tiny-mistral",
-                "base_model_config": "openaccess-ai-collective/tiny-mistral",
                 "flash_attention": True,
                 "sample_packing": True,
                 "sequence_len": 1024,
@@ -78,7 +77,6 @@ def test_ft_packing(self):
         cfg = DictDefault(
             {
                 "base_model": "openaccess-ai-collective/tiny-mistral",
-                "base_model_config": "openaccess-ai-collective/tiny-mistral",
                 "flash_attention": True,
                 "sample_packing": True,
                 "sequence_len": 1024,
2 changes: 0 additions & 2 deletions tests/e2e/test_phi.py
@@ -27,7 +27,6 @@ def test_ft(self):
         cfg = DictDefault(
             {
                 "base_model": "microsoft/phi-1_5",
-                "base_model_config": "microsoft/phi-1_5",
                 "trust_remote_code": True,
                 "model_type": "MixFormerSequentialForCausalLM",
                 "tokenizer_type": "AutoTokenizer",
@@ -71,7 +70,6 @@ def test_ft_packed(self):
         cfg = DictDefault(
             {
                 "base_model": "microsoft/phi-1_5",
-                "base_model_config": "microsoft/phi-1_5",
                 "trust_remote_code": True,
                 "model_type": "MixFormerSequentialForCausalLM",
                 "tokenizer_type": "AutoTokenizer",
7 changes: 7 additions & 0 deletions tests/test_normalize_config.py
@@ -37,3 +37,10 @@ def test_lr_as_float(self):
         normalize_config(cfg)
 
         assert cfg.learning_rate == 0.00005
+
+    def test_base_model_config_set_when_empty(self):
+        cfg = self._get_base_cfg()
+        del cfg.base_model_config
+        normalize_config(cfg)
+
+        assert cfg.base_model_config == cfg.base_model