mosaicml · vchiley · May 24, 2023 · May 20, 2023 · May 22, 2023 · May 22, 2023
@@ -19,8 +19,12 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: 'cpu'
-          container: mosaicml/pytorch:latest
+        - name: 'cpu-latest'
+          container: mosaicml/pytorch:latest  # mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
+          markers: 'not gpu'
+          pytest_command: 'coverage run -m pytest'
+        - name: 'cpu-2.0.1'
+          container: mosaicml/pytorch:2.0.1_cu117-python3.10-ubuntu20.04
           markers: 'not gpu'
           pytest_command: 'coverage run -m pytest'
     name: ${{ matrix.name }}

@@ -4,7 +4,7 @@ on:
     branches:
     - main
     - release/*
-  pull_request_target:
+  pull_request:
     branches:
     - main
     - release/**
@@ -19,8 +19,12 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: 'gpu'
-          container: mosaicml/pytorch:latest
+        - name: 'gpu-latest'
+          container: mosaicml/pytorch:latest  # mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
+          markers: 'gpu'
+          pytest_command: 'coverage run -m pytest'
+        - name: 'gpu-2.0.1'
+          container: mosaicml/pytorch:2.0.1_cu117-python3.10-ubuntu20.04
           markers: 'gpu'
           pytest_command: 'coverage run -m pytest'
     name: ${{ matrix.name }}

@@ -32,10 +32,11 @@ jobs:
           PYPI_PACKAGE_NAME="llm-foundry-test-$(date +%Y%m%d%H%M%S)"
         fi
 
-        # Remove the xentropy-cuda-lib dependency as PyPI does not support direct installs. The
-        # error message for importing FusedCrossEntropy gives instructions on how to install if a
-        # user tries to use it without this dependency.
+        # Remove the xentropy-cuda-lib and triton-pre-mlir dependencies as PyPI does not support
+        # direct installs. The error messages raised when FusedCrossEntropy or the triton attn_impl
+        # is used without its dependency give instructions on how to install it.
         sed '/xentropy-cuda-lib@git+https:\/\/github.com\/HazyResearch\/flash-attention.git@.*/d' -i setup.py
+        sed '/triton-pre-mlir@git+https:\/\/github.com\/vchiley\/triton.git@.*/d' -i setup.py
 
         python -m pip install --upgrade build twine
         python -m build

@@ -1,5 +1,6 @@
 default_language_version:
   python: python3
+exclude: llmfoundry/models/layers/flash_attn_triton.py
 repos:
 - repo: https://github.com/google/yapf
   rev: v0.32.0

@@ -76,6 +76,8 @@ Here's what you need to get started with our LLM stack:
 
 # Installation
 
+These instructions assume you already have PyTorch and CMake installed.
+
 To get started, clone this repo and install the requirements:
 
 <!--pytest.mark.skip-->

@@ -43,10 +43,12 @@ class ComposerHFCausalLM(HuggingFaceModelWithZLoss):
     """
 
     def __init__(self, om_model_config: DictConfig, tokenizer: Tokenizer):
+        trust_remote_code = om_model_config.get('trust_remote_code', True)
+        use_auth_token = om_model_config.get('use_auth_token', False)
         config = AutoConfig.from_pretrained(
             om_model_config.pretrained_model_name_or_path,
-            trust_remote_code=om_model_config.get('trust_remote_code', True),
-            use_auth_token=om_model_config.get('use_auth_token', False),
+            trust_remote_code=trust_remote_code,
+            use_auth_token=use_auth_token,
         )
 
         # set config overrides
@@ -87,19 +89,24 @@ def __init__(self, om_model_config: DictConfig, tokenizer: Tokenizer):
             if om_model_config.pretrained:
                 model = AutoModelForCausalLM.from_pretrained(
                     om_model_config.pretrained_model_name_or_path,
-                    trust_remote_code=om_model_config.get(
-                        'trust_remote_code', True),
-                    use_auth_token=om_model_config.get('use_auth_token', False),
+                    trust_remote_code=trust_remote_code,
+                    use_auth_token=use_auth_token,
                     config=config)
             else:
-                model = AutoModelForCausalLM.from_config(config)
+                model = AutoModelForCausalLM.from_config(
+                    config,
+                    trust_remote_code=trust_remote_code,
+                )
         elif init_device == 'meta':
             if om_model_config.pretrained:
                 raise ValueError(
                     'Setting cfg.pretrained=True is not supported when init_device="meta".'
                 )
             with init_empty_weights(include_buffers=False):
-                model = AutoModelForCausalLM.from_config(config)
+                model = AutoModelForCausalLM.from_config(
+                    config,
+                    trust_remote_code=trust_remote_code,
+                )
         else:
             raise ValueError(
                 f'init_device="{init_device}" must be either "cpu" or "meta".')

@@ -10,6 +10,7 @@
 import torch
 import torch.nn as nn
 from einops import rearrange
+from packaging import version
 from torch import nn
 
 from llmfoundry.models.layers.norm import LPLayerNorm
@@ -207,11 +208,27 @@ def triton_flash_attn_fn(
     multiquery=False,
 ):
     try:
-        from flash_attn import flash_attn_triton  # type: ignore
+        from llmfoundry.models.layers.flash_attn_triton import flash_attn_func
     except:
-        raise RuntimeError(
-            'Please install flash-attn==1.0.3.post0 and triton==2.0.0.dev20221202'
-        )
+        _installed = False
+        if version.parse(torch.__version__) < version.parse('2.0.0'):
+            _installed = True
+            # for torch<2.0.0 (e.g. torch 1.13.1), fall back to the triton flash attn from
+            # HazyResearch (flash-attn==1.0.3.post0 with triton==2.0.0.dev20221202)
+            try:
+                from flash_attn.flash_attn_triton import flash_attn_func
+            except:
+                _installed = False
+        if not _installed:
+            # installing triton-pre-mlir works for both torch1.13.1 and torch2.0+
+            # default recommendation is to install this variant
+            raise RuntimeError(
+                'Requirements for `attn_impl: triton` not installed. Either (1) have a CUDA-compatible GPU '
+                'and `pip install .[gpu]` if installing from llm-foundry source or '
+                '`pip install triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python` '
+                'if installing from pypi, or (2) use torch attn model.attn_config.attn_impl=torch (torch attn_impl will be slow). '
+                'Note: (1) requires you have CMake and PyTorch already installed.'
+            )
 
     check_valid_inputs(query, key, value)
 
@@ -257,9 +274,8 @@ def triton_flash_attn_fn(
         value = value.expand(*value.shape[:2], n_heads, value.size(-1))
 
     reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
-    attn_output = flash_attn_triton.flash_attn_func(query, key, value,
-                                                    attn_bias, reset_is_causal,
-                                                    softmax_scale)
+    attn_output = flash_attn_func(query, key, value, attn_bias, reset_is_causal,
+                                  softmax_scale)
 
     output = attn_output.view(*attn_output.shape[:2], -1)