attempt to also run e2e tests that need gpus (#1070)
* attempt to also run e2e tests that need gpus

* fix stray quote

* checkout specific github ref

* dockerfile for tests with proper checkout

ensure wandb is disabled for docker pytests
clear wandb env after testing
make sure to provide a default value for pop
trying skipping wandb validation tests
explicitly disable wandb in the e2e tests
explicitly set report_to to None to see if that fixes the docker e2e tests
split gpu from non-gpu unit tests
skip bf16 check in test for now
build docker w/o cache since it uses branch name ref
revert some changes now that caching is fixed
skip bf16 check if on gpu w/ support

* pytest skip for auto-gptq requirements

* skip mamba tests for now, split multipack and non-packed lora llama tests

* split tests that use monkeypatches

* fix relative import for prev commit

* move other tests using monkeypatches to the correct run
winglian committed Jan 10, 2024
1 parent 9be92d1 commit 788649f
Showing 13 changed files with 214 additions and 105 deletions.
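Several of the commit messages above revolve around keeping wandb out of the dockerized test runs. Below is a minimal sketch of that setup/teardown pattern; the class name, the exact variables cleared, and their placement are assumptions for illustration, not the literal axolotl code:

import os
import unittest


class WandbDisabledTestCase(unittest.TestCase):
    """Hypothetical base class: disable wandb for the test, then clean up."""

    def setUp(self):
        # explicitly disable wandb so dockerized e2e runs never try to log in
        os.environ["WANDB_DISABLED"] = "true"

    def tearDown(self):
        # clear wandb env after testing; the default argument keeps pop from
        # raising KeyError when a variable was never set
        os.environ.pop("WANDB_DISABLED", None)
        os.environ.pop("WANDB_PROJECT", None)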
12 changes: 10 additions & 2 deletions .github/workflows/tests-docker.yml
@@ -36,11 +36,19 @@ jobs:
PYTORCH_VERSION="${{ matrix.pytorch }}"
# Build the Docker image
docker build . \
--file ./docker/Dockerfile \
--file ./docker/Dockerfile-tests \
--build-arg BASE_TAG=$BASE_TAG \
--build-arg CUDA=$CUDA \
--build-arg GITHUB_REF=$GITHUB_REF \
--build-arg PYTORCH_VERSION=$PYTORCH_VERSION \
--tag test-axolotl
--tag test-axolotl \
--no-cache
- name: Unit Tests w docker image
run: |
docker run --rm test-axolotl pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
- name: GPU Unit Tests w docker image
run: |
docker run --privileged --gpus "all" --env WANDB_DISABLED=true --rm test-axolotl pytest --ignore=tests/e2e/patched/ /workspace/axolotl/tests/e2e/
- name: GPU Unit Tests monkeypatched w docker image
run: |
docker run --privileged --gpus "all" --env WANDB_DISABLED=true --rm test-axolotl pytest /workspace/axolotl/tests/e2e/patched/
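The workflow now runs three separate pytest invocations: CPU-only unit tests, GPU e2e tests, and the monkeypatched tests under tests/e2e/patched/. Isolating the patched tests in their own process is presumably about leakage: module-level patches persist for every test that runs afterwards in the same interpreter. A toy, self-contained illustration of that hazard (not axolotl code):

import json


def test_patched_behavior():
    # module-level monkeypatch: every later use in this process sees it
    json.dumps = lambda obj, **kwargs: "patched!"
    assert json.dumps({}) == "patched!"


def test_unpatched_behavior():
    # passes in a fresh process, but fails if it runs after the test above
    # in the same interpreter, since the patch was never undone
    assert json.dumps({}) == "{}"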
40 changes: 40 additions & 0 deletions docker/Dockerfile-tests
@@ -0,0 +1,40 @@
ARG BASE_TAG=main-base
FROM winglian/axolotl-base:$BASE_TAG

ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
ARG AXOLOTL_EXTRAS=""
ARG CUDA="118"
ENV BNB_CUDA_VERSION=$CUDA
ARG PYTORCH_VERSION="2.0.1"
ARG GITHUB_REF="main"

ENV PYTORCH_VERSION=$PYTORCH_VERSION

RUN apt-get update && \
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev

WORKDIR /workspace

RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git

WORKDIR /workspace/axolotl

RUN git fetch origin +$GITHUB_REF && \
git checkout FETCH_HEAD

# If AXOLOTL_EXTRAS is set, append it in brackets
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS]; \
else \
pip install -e .[deepspeed,flash-attn,mamba-ssm]; \
fi

# So we can test the Docker image
RUN pip install pytest

# fix so that git fetch/pull from remote works
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
git config --get remote.origin.fetch

# helper for huggingface-login cli
RUN git config --global credential.helper store
Empty file added tests/e2e/patched/__init__.py
Empty file.
@@ -15,7 +15,7 @@
from axolotl.utils.config import normalize_config
from axolotl.utils.dict import DictDefault

from .utils import with_temp_dir
from ..utils import with_temp_dir

LOG = logging.getLogger("axolotl.tests.e2e")
os.environ["WANDB_DISABLED"] = "true"
126 changes: 126 additions & 0 deletions tests/e2e/patched/test_lora_llama_multipack.py
@@ -0,0 +1,126 @@
"""
E2E tests for lora llama
"""

import logging
import os
import unittest
from pathlib import Path

import pytest
from transformers.utils import is_auto_gptq_available, is_torch_bf16_gpu_available

from axolotl.cli import load_datasets
from axolotl.common.cli import TrainerCliArgs
from axolotl.train import train
from axolotl.utils.config import normalize_config
from axolotl.utils.dict import DictDefault

from ..utils import with_temp_dir

LOG = logging.getLogger("axolotl.tests.e2e")
os.environ["WANDB_DISABLED"] = "true"


class TestLoraLlama(unittest.TestCase):
"""
Test case for Llama models using LoRA w multipack
"""

@with_temp_dir
def test_lora_packing(self, temp_dir):
# pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "JackFram/llama-68m",
"tokenizer_type": "LlamaTokenizer",
"sequence_len": 1024,
"sample_packing": True,
"flash_attention": True,
"load_in_8bit": True,
"adapter": "lora",
"lora_r": 32,
"lora_alpha": 64,
"lora_dropout": 0.05,
"lora_target_linear": True,
"val_set_size": 0.1,
"special_tokens": {
"unk_token": "<unk>",
"bos_token": "<s>",
"eos_token": "</s>",
},
"datasets": [
{
"path": "mhenrichsen/alpaca_2k_test",
"type": "alpaca",
},
],
"num_epochs": 2,
"micro_batch_size": 8,
"gradient_accumulation_steps": 1,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_torch",
"lr_scheduler": "cosine",
}
)
if is_torch_bf16_gpu_available():
cfg.bf16 = True
else:
cfg.fp16 = True

normalize_config(cfg)
cli_args = TrainerCliArgs()
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
assert (Path(temp_dir) / "adapter_model.bin").exists()

@pytest.mark.skipif(not is_auto_gptq_available(), reason="auto-gptq not available")
@with_temp_dir
def test_lora_gptq_packed(self, temp_dir):
# pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "TheBlokeAI/jackfram_llama-68m-GPTQ",
"model_type": "AutoModelForCausalLM",
"tokenizer_type": "LlamaTokenizer",
"sequence_len": 1024,
"sample_packing": True,
"flash_attention": True,
"load_in_8bit": True,
"adapter": "lora",
"gptq": True,
"gptq_disable_exllama": True,
"lora_r": 32,
"lora_alpha": 64,
"lora_dropout": 0.05,
"lora_target_linear": True,
"val_set_size": 0.1,
"special_tokens": {
"unk_token": "<unk>",
"bos_token": "<s>",
"eos_token": "</s>",
},
"datasets": [
{
"path": "mhenrichsen/alpaca_2k_test",
"type": "alpaca",
},
],
"num_epochs": 2,
"save_steps": 0.5,
"micro_batch_size": 8,
"gradient_accumulation_steps": 1,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_torch",
"lr_scheduler": "cosine",
}
)
normalize_config(cfg)
cli_args = TrainerCliArgs()
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
assert (Path(temp_dir) / "adapter_model.bin").exists()
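Each test above receives temp_dir via the with_temp_dir decorator imported from ../utils. A minimal sketch of what such a helper presumably looks like (the real implementation may differ, e.g. in cleanup behavior):

import shutil
import tempfile
from functools import wraps


def with_temp_dir(test_func):
    @wraps(test_func)
    def wrapper(*args, **kwargs):
        temp_dir = tempfile.mkdtemp()
        try:
            # hand the fresh directory to the test as its last positional arg
            return test_func(*args, temp_dir, **kwargs)
        finally:
            # remove the directory and everything written under it
            shutil.rmtree(temp_dir)

    return wrapper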
@@ -15,7 +15,7 @@
from axolotl.utils.config import normalize_config
from axolotl.utils.dict import DictDefault

from .utils import with_temp_dir
from ..utils import with_temp_dir

LOG = logging.getLogger("axolotl.tests.e2e")
os.environ["WANDB_DISABLED"] = "true"
@@ -15,7 +15,7 @@
from axolotl.utils.config import normalize_config
from axolotl.utils.dict import DictDefault

from .utils import with_temp_dir
from ..utils import with_temp_dir

LOG = logging.getLogger("axolotl.tests.e2e")
os.environ["WANDB_DISABLED"] = "true"
@@ -9,7 +9,7 @@
from axolotl.utils.dict import DictDefault
from axolotl.utils.models import load_model, load_tokenizer

from .utils import with_temp_dir
from ..utils import with_temp_dir


class TestModelPatches(unittest.TestCase):
4 changes: 2 additions & 2 deletions tests/e2e/test_resume.py → tests/e2e/patched/test_resume.py
@@ -17,7 +17,7 @@
from axolotl.utils.config import normalize_config
from axolotl.utils.dict import DictDefault

from .utils import most_recent_subdir, with_temp_dir
from ..utils import most_recent_subdir, with_temp_dir

LOG = logging.getLogger("axolotl.tests.e2e")
os.environ["WANDB_DISABLED"] = "true"
@@ -29,7 +29,7 @@ class TestResumeLlama(unittest.TestCase):
"""

@with_temp_dir
def test_resume_qlora(self, temp_dir):
def test_resume_qlora_packed(self, temp_dir):
# pylint: disable=duplicate-code
cfg = DictDefault(
{
93 changes: 0 additions & 93 deletions tests/e2e/test_lora_llama.py
@@ -65,96 +65,3 @@ def test_lora(self, temp_dir):

train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
assert (Path(temp_dir) / "adapter_model.bin").exists()

@with_temp_dir
def test_lora_packing(self, temp_dir):
# pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "JackFram/llama-68m",
"tokenizer_type": "LlamaTokenizer",
"sequence_len": 1024,
"sample_packing": True,
"flash_attention": True,
"load_in_8bit": True,
"adapter": "lora",
"lora_r": 32,
"lora_alpha": 64,
"lora_dropout": 0.05,
"lora_target_linear": True,
"val_set_size": 0.1,
"special_tokens": {
"unk_token": "<unk>",
"bos_token": "<s>",
"eos_token": "</s>",
},
"datasets": [
{
"path": "mhenrichsen/alpaca_2k_test",
"type": "alpaca",
},
],
"num_epochs": 2,
"micro_batch_size": 8,
"gradient_accumulation_steps": 1,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_torch",
"lr_scheduler": "cosine",
"bf16": True,
}
)
normalize_config(cfg)
cli_args = TrainerCliArgs()
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
assert (Path(temp_dir) / "adapter_model.bin").exists()

@with_temp_dir
def test_lora_gptq(self, temp_dir):
# pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "TheBlokeAI/jackfram_llama-68m-GPTQ",
"model_type": "AutoModelForCausalLM",
"tokenizer_type": "LlamaTokenizer",
"sequence_len": 1024,
"sample_packing": True,
"flash_attention": True,
"load_in_8bit": True,
"adapter": "lora",
"gptq": True,
"gptq_disable_exllama": True,
"lora_r": 32,
"lora_alpha": 64,
"lora_dropout": 0.05,
"lora_target_linear": True,
"val_set_size": 0.1,
"special_tokens": {
"unk_token": "<unk>",
"bos_token": "<s>",
"eos_token": "</s>",
},
"datasets": [
{
"path": "mhenrichsen/alpaca_2k_test",
"type": "alpaca",
},
],
"num_epochs": 2,
"save_steps": 0.5,
"micro_batch_size": 8,
"gradient_accumulation_steps": 1,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_torch",
"lr_scheduler": "cosine",
}
)
normalize_config(cfg)
cli_args = TrainerCliArgs()
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
assert (Path(temp_dir) / "adapter_model.bin").exists()
7 changes: 5 additions & 2 deletions tests/e2e/test_mamba.py
@@ -7,6 +7,8 @@
import unittest
from pathlib import Path

import pytest

from axolotl.cli import load_datasets
from axolotl.common.cli import TrainerCliArgs
from axolotl.train import train
@@ -19,9 +21,10 @@
os.environ["WANDB_DISABLED"] = "true"


class TestMistral(unittest.TestCase):
@pytest.mark.skip(reason="skipping until upstreamed into transformers")
class TestMamba(unittest.TestCase):
"""
Test case for Llama models using LoRA
Test case for Mamba models
"""

@with_temp_dir