Merge branch 'main' into cuda121

axolotl-ai-cloud · Dec 22, 2023 · 5a84b5c · 5a84b5c
2 parents 65809c4 + 2e61dc3
commit 5a84b5c
Show file tree

Hide file tree

Showing 6 changed files with 124 additions and 4 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -43,27 +43,40 @@ jobs:
         uses: docker/metadata-action@v5
         with:
           images: winglian/axolotl
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
       - name: Login to Docker Hub
         uses: docker/login-action@v3
         with:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Build
+      # guidance for testing before pushing: https://docs.docker.com/build/ci/github-actions/test-before-push/
+      - name: Build and export to Docker
         uses: docker/build-push-action@v5
         with:
           context: .
+          load: true
           build-args: |
             BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
             CUDA=${{ matrix.cuda }}
             PYTORCH_VERSION=${{ matrix.pytorch }}
           file: ./docker/Dockerfile
-          push: ${{ github.event_name != 'pull_request' }}
           tags: |
             ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
             ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
           labels: ${{ steps.metadata.outputs.labels }}
+      - name: Unit Tests  # run pytest (minus tests/e2e/) inside the image tagged by the build step, ahead of the push step
+        run: |
+          docker run --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
+      - name: Push to Docker Hub
+        if: github.event_name != 'pull_request'
+        env:  # expand expressions here, not inside the script, so the shell only ever sees quoted variables
+          IMAGE_TAG: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+          LATEST_TAG: ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
+        run: |
+          docker push "$IMAGE_TAG"
+          if [ -n "$LATEST_TAG" ]; then docker push "$LATEST_TAG"; fi  # empty unless matrix.is_latest is set
+
   build-axolotl-runpod:
     needs: build-axolotl
     if: github.repository_owner == 'OpenAccess-AI-Collective'

diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -25,6 +25,9 @@ RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
         pip install -e .[deepspeed,flash-attn]; \
     fi
 
+# Install pytest so the test suite can be run inside the built image
+RUN pip install pytest
+
 # fix so that git fetch/pull from remote works
 RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
     git config --get remote.origin.fetch

diff --git a/src/axolotl/utils/config.py b/src/axolotl/utils/config.py
@@ -448,6 +448,20 @@ def validate_config(cfg):
     if cfg.neftune_noise_alpha is not None and cfg.neftune_noise_alpha <= 0.0:
         raise ValueError("neftune_noise_alpha must be > 0.0")
 
+    if (
+        cfg.adapter
+        and cfg.tokens
+        and (
+            not cfg.lora_modules_to_save
+            or not all(
+                x in cfg.lora_modules_to_save for x in ["embed_tokens", "lm_head"]
+            )
+        )
+    ):
+        raise ValueError(
+            "lora_modules_to_save not properly set yet adding new tokens. Please add `embed_tokens` and `lm_head` to `lora_modules_to_save`."
+        )
+
     # TODO
     # MPT 7b
     # https://github.com/facebookresearch/bitsandbytes/issues/25

diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
@@ -136,6 +136,23 @@ def load_tokenizer(cfg):
 
     if cfg.special_tokens:
         for k, val in cfg.special_tokens.items():
+            # if adapter training changes a special token the tokenizer already has
+            # (or adds one), lora_modules_to_save must list embed_tokens and lm_head
+            if (
+                (getattr(tokenizer, k) is None or getattr(tokenizer, k) != val)
+                and cfg.adapter
+                and (
+                    not cfg.lora_modules_to_save
+                    or not all(
+                        x in cfg.lora_modules_to_save
+                        for x in ["embed_tokens", "lm_head"]
+                    )
+                )
+            ):
+                raise ValueError(
+                    "Please set lora_modules_to_save to ['embed_tokens', 'lm_head'] when using an adapter and changing the special tokens."
+                )
+
             tokenizer.add_special_tokens(
                 {k: AddedToken(val, rstrip=False, lstrip=False, normalized=False)}
             )

diff --git a/tests/test_tokenizers.py b/tests/test_tokenizers.py
@@ -3,6 +3,8 @@
 """
 import unittest
 
+import pytest
+
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_tokenizer
 
@@ -31,6 +33,40 @@ def test_dont_use_fast(self):
         tokenizer = load_tokenizer(cfg)
         assert "Fast" not in tokenizer.__class__.__name__
 
+    def test_special_tokens_modules_to_save(self):
+        """
+        Changing a special token under an adapter needs lora_modules_to_save set.
+        """
+        # adapter + new bos_token: rejected (match uses re.search, no wildcards needed)
+        cfg = DictDefault(
+            {
+                "tokenizer_config": "huggyllama/llama-7b",
+                "adapter": "lora",
+                "special_tokens": {"bos_token": "[INST]"},
+            }
+        )
+        with pytest.raises(ValueError, match=r"Please set lora_modules_to_save"):
+            load_tokenizer(cfg)
+
+        # adapter + bos_token left at the tokenizer's default value: accepted
+        cfg = DictDefault(
+            {
+                "tokenizer_config": "huggyllama/llama-7b",
+                "adapter": "lora",
+                "special_tokens": {"bos_token": "<s>"},
+            }
+        )
+        load_tokenizer(cfg)
+
+        # no adapter: changing a special token is accepted
+        cfg = DictDefault(
+            {
+                "tokenizer_config": "huggyllama/llama-7b",
+                "special_tokens": {"bos_token": "[INST]"},
+            }
+        )
+        load_tokenizer(cfg)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/test_validation.py b/tests/test_validation.py
@@ -682,6 +682,43 @@ def test_warmup_step_no_conflict(self):
 
         validate_config(cfg)
 
+    def test_add_tokens_adapter(self):
+        """
+        New tokens plus an adapter require both modules in lora_modules_to_save.
+        """
+        # plain substring; a trailing `*` would only make the final "s" optional
+        expected = r"lora_modules_to_save not properly set yet adding new tokens"
+
+        # lora_modules_to_save not set at all: rejected
+        cfg = DictDefault(
+            {"adapter": "qlora", "load_in_4bit": True, "tokens": ["<|imstart|>"]}
+        )
+        with pytest.raises(ValueError, match=expected):
+            validate_config(cfg)
+
+        # only embed_tokens listed, lm_head missing: rejected
+        cfg = DictDefault(
+            {
+                "adapter": "qlora",
+                "load_in_4bit": True,
+                "tokens": ["<|imstart|>"],
+                "lora_modules_to_save": ["embed_tokens"],
+            }
+        )
+        with pytest.raises(ValueError, match=expected):
+            validate_config(cfg)
+
+        # both embed_tokens and lm_head listed: validation passes
+        cfg = DictDefault(
+            {
+                "adapter": "qlora",
+                "load_in_4bit": True,
+                "tokens": ["<|imstart|>"],
+                "lora_modules_to_save": ["embed_tokens", "lm_head"],
+            }
+        )
+        validate_config(cfg)
+
 
 class ValidationWandbTest(ValidationTest):
     """