
Commit

Merge branch 'main' into cosine_min_lr

winglian authored Jan 9, 2024
2 parents b64432e + c3e8165 commit 2eb9f6e
Showing 13 changed files with 289 additions and 126 deletions.
2 changes: 1 addition & 1 deletion .github/FUNDING.yml
@@ -3,7 +3,7 @@
github: OpenAccess-AI-Collective # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
patreon: # Replace with a single Patreon username
open_collective: # Replace with a single Open Collective username
ko_fi: # Replace with a single Ko-fi username
ko_fi: axolotl_ai # Replace with a single Ko-fi username
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay: # Replace with a single Liberapay username
31 changes: 31 additions & 0 deletions README.md
@@ -43,6 +43,7 @@ Features:
- [Badge](#badge-)
- [Community Showcase](#community-showcase)
- [Contributing](#contributing-)
- [Sponsors](#sponsors-)

</td>
<td>
@@ -1150,3 +1151,33 @@ pre-commit install
# test
pytest tests/
```

## Sponsors 🤝❤

OpenAccess AI Collective is run by volunteer contributors such as [winglian](https://github.com/winglian),
[NanoCode012](https://github.com/NanoCode012), [tmm1](https://github.com/tmm1),
[mhenrichsen](https://github.com/mhenrichsen), [casper-hansen](https://github.com/casper-hansen),
[hamelsmu](https://github.com/hamelsmu), and many more who help us accelerate forward by fixing bugs, answering
community questions, and implementing new features. Axolotl needs donations from sponsors for the compute needed to
run our unit & integration tests, for troubleshooting community issues, and for providing bounties. If you love
axolotl, consider sponsoring the project via [GitHub Sponsors](https://github.com/sponsors/OpenAccess-AI-Collective)
or [Ko-fi](https://ko-fi.com/axolotl_ai), or reach out directly to
[wing@openaccessaicollective.org](mailto:wing@openaccessaicollective.org).

---

#### 💎 Diamond Sponsors - [Contact directly](mailto:wing@openaccessaicollective.org)

---

#### 🥇 Gold Sponsors - $5000/mo

---

#### 🥈 Silver Sponsors - $1000/mo

---

#### 🥉 Bronze Sponsors - $500/mo

---
73 changes: 73 additions & 0 deletions examples/phi/phi2-ft.yml
@@ -0,0 +1,73 @@
base_model: microsoft/phi-2
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
trust_remote_code: true

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:
  - path: garage-bAInd/Open-Platypus
    type: alpaca

dataset_prepared_path:
val_set_size: 0.05
output_dir: ./phi-sft-out

sequence_len: 2048
sample_packing: false # currently unsupported
pad_to_sequence_len:

adapter:
lora_model_dir:
lora_r: 16
lora_alpha: 32
lora_dropout: 0.1
lora_target_linear: true
lora_fan_in_fan_out:
lora_modules_to_save:
  - embd
  - lm_head

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 4
optimizer: paged_adamw_8bit
adam_beta2: 0.95
adam_epsilon: 0.00001
max_grad_norm: 1.0
lr_scheduler: cosine
learning_rate: 1e-5

train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
tf32: true

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 100
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
fsdp_config:
resize_token_embeddings_to_32x: true
special_tokens:
  pad_token: "<|endoftext|>"
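
For reference, a config like the new `examples/phi/phi2-ft.yml` above is consumed by axolotl's training entry point. The snippet below is only a sketch of launching it programmatically, assuming axolotl and `accelerate` are installed and the config sits at the path shown; it is not part of this commit.

```python
# Hypothetical launcher for the Phi-2 fine-tune config above (sketch, not in this commit).
import subprocess

subprocess.run(
    [
        "accelerate", "launch",
        "-m", "axolotl.cli.train",
        "examples/phi/phi2-ft.yml",
    ],
    check=True,  # surface a non-zero exit from the training run as an exception
)
```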
1 change: 1 addition & 0 deletions requirements.txt
@@ -12,6 +12,7 @@ fire
PyYAML>=6.0
datasets>=2.15.0
flash-attn==2.3.3
fused-dense-lib @ git+https://github.com/Dao-AILab/flash-attention@v2.3.3#subdirectory=csrc/fused_dense_lib
sentencepiece
wandb
einops
4 changes: 4 additions & 0 deletions setup.py
@@ -17,6 +17,7 @@ def parse_requirements():
_dependency_links.append(url)
elif (
"flash-attn" not in line
and "flash-attention" not in line
and "deepspeed" not in line
and line
and line[0] != "#"
@@ -51,6 +52,9 @@ def parse_requirements():
"flash-attn": [
"flash-attn==2.3.3",
],
"fused-dense-lib": [
"fused-dense-lib @ git+https://github.com/Dao-AILab/flash-attention@v2.3.3#subdirectory=csrc/fused_dense_lib",
],
"deepspeed": [
"deepspeed",
],
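The new `and "flash-attention" not in line` filter keeps the `fused-dense-lib @ git+...flash-attention...` requirement added above out of the base `install_requires`, exposing it only through the new `fused-dense-lib` extra. A rough reconstruction of how the filter in `parse_requirements()` reads after this change is sketched below; everything outside the lines shown in the diff (variable names, the surrounding loop) is an assumption.

```python
# Sketch of setup.py's parse_requirements() after this change; only the elif
# condition is taken verbatim from the diff, the rest is assumed structure.
def parse_requirements():
    _install_requires = []
    _dependency_links = []
    with open("./requirements.txt", encoding="utf-8") as requirements_file:
        lines = [r.strip() for r in requirements_file.readlines()]
        for line in lines:
            if line.startswith("--extra-index-url"):
                # keep extra index URLs as dependency links rather than requirements
                url = line.split()[1]
                _dependency_links.append(url)
            elif (
                "flash-attn" not in line
                and "flash-attention" not in line  # new: skips the fused-dense-lib VCS line
                and "deepspeed" not in line
                and line
                and line[0] != "#"
            ):
                # everything else is a hard install requirement
                _install_requires.append(line)
    return _install_requires, _dependency_links
```

With the matching `extras_require` entry, the fused kernels can then be pulled in on demand, e.g. `pip install -e '.[fused-dense-lib]'`.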
26 changes: 12 additions & 14 deletions src/axolotl/core/trainer_builder.py
@@ -34,9 +34,10 @@
)
from axolotl.utils.collators import (
BatchSamplerDataCollatorForSeq2Seq,
DataCollatorForSeq2Seq,
MambaDataCollator,
)
from axolotl.utils.samplers import MultipackBatchSampler
from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
from axolotl.utils.schedulers import get_cosine_schedule_with_quadratic_warmup, get_cosine_schedule_with_min_lr

try:
@@ -184,12 +185,7 @@ def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
self.args.train_batch_size,
drop_last=True,
batch_max_len=self._train_batch_size * self.args.max_seq_length,
lengths=(
self.train_dataset.data.column("position_ids")
.to_pandas()
.apply(lambda x: x[-1] + 1)
.values
),
lengths=get_dataset_lengths(self.train_dataset),
packing_efficiency_estimate=self.args.sample_packing_efficiency,
)
return super()._get_train_sampler()
@@ -203,12 +199,7 @@ def _get_eval_sampler(
self.args.per_device_eval_batch_size,
drop_last=True,
batch_max_len=self.args.eval_batch_size * self.args.max_seq_length,
lengths=(
eval_dataset.data.column("position_ids")
.to_pandas()
.apply(lambda x: x[-1] + 1)
.values
),
lengths=get_dataset_lengths(eval_dataset),
packing_efficiency_estimate=self.args.sample_packing_efficiency,
)
return super()._get_eval_sampler(eval_dataset)
@@ -859,7 +850,14 @@ def build_collator(self, training_args: AxolotlTrainingArguments, **kwargs):
if self.cfg.model_config_type == "mamba":
return MambaDataCollator(tokenizer=self.tokenizer)

return BatchSamplerDataCollatorForSeq2Seq(
if training_args.sample_packing:
return BatchSamplerDataCollatorForSeq2Seq(
self.tokenizer,
return_tensors="pt",
**kwargs,
)

return DataCollatorForSeq2Seq(
self.tokenizer,
return_tensors="pt",
**kwargs,
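
Two of the pieces referenced in this file are not shown in the diff, so hedged sketches follow. First, `get_dataset_lengths` replaces the inline `position_ids`-based length computation that these hunks delete; assuming the helper simply mirrors that deleted code, it might look like:

```python
# Sketch of get_dataset_lengths, inferred from the inline code it replaces in this
# diff; the real helper in axolotl.utils.samplers may handle more cases.
def get_dataset_lengths(dataset):
    """Per-sample token lengths for a packed dataset."""
    # Each sample's position_ids run 0..length-1, so the last value + 1 is its
    # length, exactly as the removed lambda computed.
    return (
        dataset.data.column("position_ids")
        .to_pandas()
        .apply(lambda x: x[-1] + 1)
        .values
    )
```

Second, the import of `get_cosine_schedule_with_min_lr` comes from the `cosine_min_lr` branch this commit merges `main` into. Its implementation is not part of this diff; a minimal sketch of a cosine schedule that decays to a floor instead of zero (the signature and the `min_lr_ratio` name are assumptions) could be:

```python
# Minimal sketch of a cosine schedule with a minimum-LR floor; the real
# get_cosine_schedule_with_min_lr in axolotl.utils.schedulers may differ.
import math

from torch.optim import Optimizer
from torch.optim.lr_scheduler import LambdaLR


def get_cosine_schedule_with_min_lr(
    optimizer: Optimizer,
    num_warmup_steps: int,
    num_training_steps: int,
    min_lr_ratio: float = 0.1,
) -> LambdaLR:
    def lr_lambda(current_step: int) -> float:
        if current_step < num_warmup_steps:
            # linear warmup from 0 up to the base learning rate
            return current_step / max(1, num_warmup_steps)
        progress = (current_step - num_warmup_steps) / max(
            1, num_training_steps - num_warmup_steps
        )
        cosine = 0.5 * (1.0 + math.cos(math.pi * progress))
        # rescale so the multiplier decays from 1.0 to min_lr_ratio rather than 0
        return min_lr_ratio + (1.0 - min_lr_ratio) * cosine

    return LambdaLR(optimizer, lr_lambda)
```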
(Diffs for the remaining 7 changed files are not included here.)