
Commit 8da0fd2

fix conflicts (remove once axolotl-ai-cloud#918 is merged upstream)
kallewoof committed Dec 18, 2023
2 parents a887d54 + 1dd792f commit 8da0fd2
Showing 3 changed files with 16 additions and 4 deletions.
README.md (8 additions, 1 deletion)
@@ -544,6 +544,11 @@ tf32: true # require >=ampere
 bfloat16: true # require >=ampere
 float16: true
 
+# Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset
+gpu_memory_limit: 20GiB
+# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
+lora_on_cpu: true
+
 # A list of one or more datasets to finetune the model with
 datasets:
   # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
@@ -1031,12 +1036,14 @@ Add below flag to train command above
 python3 -m axolotl.cli.merge_lora examples/your_config.yml --lora_model_dir="./completed-model" --load_in_8bit=False --load_in_4bit=False
 ```
 
-If you run out of CUDA memory, you can try to merge in system RAM with
+You may need to use the `gpu_memory_limit` and/or `lora_on_cpu` config options to avoid running out of memory. If you still run out of CUDA memory, you can try to merge in system RAM with
 
 ```bash
 CUDA_VISIBLE_DEVICES="" python3 -m axolotl.cli.merge_lora ...
 ```
 
+although this will be very slow; using the config options above is recommended instead.
+
 ## Common Errors 🧰
 
 See also the [FAQ's](./docs/faq.md).
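For reference, the two new options are meant to be used together at merge time. A minimal config excerpt might look like the following (the 20GiB cap is illustrative, not a recommendation):

```yaml
# Illustrative merge-time excerpt; values are examples, not defaults.
gpu_memory_limit: 20GiB  # cap each visible GPU at roughly 20 GiB
lora_on_cpu: true        # load the LoRA adapter weights in system RAM
```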
examples/llama-2/qlora.yml (2 additions, 0 deletions)
@@ -6,6 +6,8 @@ is_llama_derived_model: true
 load_in_8bit: false
 load_in_4bit: true
 strict: false
+gpu_memory_limit: 20GiB
+lora_on_cpu: true
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test
src/axolotl/utils/models.py (6 additions, 3 deletions)
@@ -292,6 +292,7 @@ def load_model(
     for i in range(torch.cuda.device_count()):
         max_memory[i] = f"{cfg.gpu_memory_limit}GiB"
     max_memory["cpu"] = "256GB"
+
     with init_empty_weights():
         model_canvas = AutoModelForCausalLM.from_config(model_config)
     model_canvas.tie_weights()
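The hunk above builds a per-device `max_memory` map from `cfg.gpu_memory_limit`. As a rough, self-contained sketch of how such a map is consumed downstream (not axolotl's exact code; the model name and the 20 GiB value are assumptions), transformers/accelerate accept it alongside `device_map="auto"`:

```python
# Sketch: capping per-GPU memory in the style of gpu_memory_limit.
# Assumes transformers and accelerate are installed; the model is a placeholder.
import torch
from transformers import AutoModelForCausalLM

gpu_memory_limit = 20  # GiB, mirroring the example config above

max_memory = {i: f"{gpu_memory_limit}GiB" for i in range(torch.cuda.device_count())}
max_memory["cpu"] = "256GiB"  # let overflow weights sit in system RAM

model = AutoModelForCausalLM.from_pretrained(
    "huggyllama/llama-7b",  # placeholder base model
    device_map="auto",      # accelerate places weights while respecting the caps
    max_memory=max_memory,
)
```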
@@ -681,13 +682,15 @@ def load_lora(model, cfg, inference=False):

     if cfg.lora_model_dir:
         LOG.debug("Loading pretrained PEFT - LoRA")
+        model_kwargs: Any = {}
+        if cfg.lora_on_cpu:
+            model_kwargs["max_memory"] = {"cpu": "256GiB"}
+            model_kwargs["device_map"] = {"": "cpu"}
         model = PeftModel.from_pretrained(
             model,
             cfg.lora_model_dir,
             is_trainable=(not inference),
-            # offload_folder="/usr/ssd/offload_dir",
-            # max_memory=max_memory,
-            # device_map=device_map,
+            **model_kwargs,
         )
     else:
         model = get_peft_model(model, lora_config)
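The `lora_on_cpu` branch above pins the adapter load to the CPU via `device_map={"": "cpu"}`. A minimal usage sketch of the same idea with PEFT's public API, assuming a base model already loaded on CPU and a placeholder adapter path:

```python
# Sketch: loading and merging a LoRA adapter entirely in system RAM.
# base_model is assumed to be a CPU-resident AutoModelForCausalLM;
# "./completed-model" is a placeholder adapter directory.
from peft import PeftModel

peft_model = PeftModel.from_pretrained(
    base_model,
    "./completed-model",
    device_map={"": "cpu"},        # pin every module to the CPU
    max_memory={"cpu": "256GiB"},  # generous CPU budget, as in the hunk above
)
merged = peft_model.merge_and_unload()  # bake the adapter into the base weights
```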
