add intel xpu support for TGI (huggingface#1475)

# What does this PR do?   Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.  --------- Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> Co-authored-by: Morgan Funtowicz <funtowiczmo@gmail.com> Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
martinigoyanes · Apr 30, 2024 · 08f92bd · 08f92bd
1 parent 965dfcb
commit 08f92bd
Show file tree

Hide file tree

Showing 18 changed files with 433 additions and 76 deletions.
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
@@ -274,12 +274,105 @@ jobs:
           cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-rocm,mode=min
           cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-rocm,mode=min
 
+  build-and-push-image-intel:
+    concurrency:
+      group: ${{ github.workflow }}-build-and-push-image-intel-${{ github.head_ref || github.run_id }}
+      cancel-in-progress: true
+    needs:
+      - start-runner
+      - build-and-push-image # Wait for the main docker image to be built
+      - integration-tests # Wait for the main integration-tests
+    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
+    permissions:
+      contents: write
+      packages: write
+      # This is used to complete the identity challenge
+      # with sigstore/fulcio when running outside of PRs.
+      id-token: write
+      security-events: write
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+      - name: Initialize Docker Buildx
+        uses: docker/setup-buildx-action@v2.0.0
+        with:
+          install: true
+      - name: Inject slug/short variables
+        uses: rlespinasse/github-slug-action@v4.4.1
+      - name: Tailscale
+        uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966
+        with:
+          authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
+      - name: Login to GitHub Container Registry
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v2
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Login to internal Container Registry
+        uses: docker/login-action@v2.1.0
+        with:
+          username: ${{ secrets.TAILSCALE_DOCKER_USERNAME }}
+          password: ${{ secrets.TAILSCALE_DOCKER_PASSWORD }}
+          registry: registry.internal.huggingface.tech
+      - name: Login to Azure Container Registry
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v2.1.0
+        with:
+          username: ${{ secrets.AZURE_DOCKER_USERNAME }}
+          password: ${{ secrets.AZURE_DOCKER_PASSWORD }}
+          registry: db4c2190dd824d1f950f5d1555fbadf0.azurecr.io
+      # If pull request
+      - name: Extract metadata (tags, labels) for Docker
+        if: ${{ github.event_name == 'pull_request' }}
+        id: meta-pr
+        uses: docker/metadata-action@v4.3.0
+        with:
+          images: |
+            registry.internal.huggingface.tech/api-inference/community/text-generation-inference
+          tags: |
+            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}-intel
+      # If main, release or tag
+      - name: Extract metadata (tags, labels) for Docker
+        if: ${{ github.event_name != 'pull_request' }}
+        id: meta
+        uses: docker/metadata-action@v4.3.0
+        with:
+          flavor: |
+            latest=false
+          images: |
+            registry.internal.huggingface.tech/api-inference/community/text-generation-inference
+            ghcr.io/huggingface/text-generation-inference
+            db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
+          tags: |
+            type=semver,pattern={{version}}-intel
+            type=semver,pattern={{major}}.{{minor}}-intel
+            type=raw,value=latest-intel,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
+            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}-intel
+      - name: Build and push Docker image
+        id: build-and-push
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: Dockerfile_intel
+          push: true
+          platforms: 'linux/amd64'
+          build-args: |
+            GIT_SHA=${{ env.GITHUB_SHA }}
+            DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}-intel
+          tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-intel,mode=min
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-intel,mode=min
+
   stop-runner:
     name: Stop self-hosted EC2 runner
     needs:
       - start-runner
       - build-and-push-image
       - build-and-push-image-rocm
+      - build-and-push-image-intel
       - integration-tests
     runs-on: ubuntu-latest
     env:

diff --git a/Dockerfile_intel b/Dockerfile_intel
@@ -0,0 +1,105 @@
+FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef
+WORKDIR /usr/src
+
+ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
+
+FROM chef as planner
+COPY Cargo.toml Cargo.toml
+COPY rust-toolchain.toml rust-toolchain.toml
+COPY proto proto
+COPY benchmark benchmark
+COPY router router
+COPY launcher launcher
+RUN cargo chef prepare --recipe-path recipe.json
+
+FROM chef AS builder
+
+ARG GIT_SHA
+ARG DOCKER_LABEL
+
+RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
+    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
+    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
+    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
+    rm -f $PROTOC_ZIP
+
+COPY --from=planner /usr/src/recipe.json recipe.json
+RUN cargo chef cook --release --recipe-path recipe.json
+
+COPY Cargo.toml Cargo.toml
+COPY rust-toolchain.toml rust-toolchain.toml
+COPY proto proto
+COPY benchmark benchmark
+COPY router router
+COPY launcher launcher
+RUN cargo build --release
+
+
+# Text Generation Inference base image for Intel
+FROM intel/intel-extension-for-pytorch:2.1.10-xpu as base
+
+USER root
+# libssl.so.1.1 is not installed on Ubuntu 22.04 by default, install it
+RUN wget http://nz2.archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb && \
+    dpkg -i ./libssl1.1_1.1.1f-1ubuntu2_amd64.deb
+
+
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
+| gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list
+
+RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build
+
+# Text Generation Inference base env
+ENV HUGGINGFACE_HUB_CACHE=/data \
+    HF_HUB_ENABLE_HF_TRANSFER=1 \
+    PORT=80
+
+
+WORKDIR /usr/src
+# Build pytorch and ipex
+RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout -b xpu_main origin/xpu-main
+RUN git clone https://github.com/pytorch/pytorch.git && cd pytorch && git checkout 209f2fa8ff86652f67d75c2f19bf9cb9942fd018 && git apply /usr/src/intel-extension-for-pytorch/torch_patches/00*.patch
+
+# Install server
+COPY proto proto
+COPY server server
+COPY server/Makefile server/Makefile
+RUN cd server && \
+    make gen-server && \
+    pip install -r requirements_cuda.txt && \
+    pip install ".[accelerate, peft, outlines]" --no-cache-dir
+
+ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest
+ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest
+ENV FI_PROVIDER_PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric
+ENV DIAGUTIL_PATH=/opt/intel/oneapi/compiler/latest/etc/compiler/sys_check/sys_check.sh
+ENV CCL_CONFIGURATION=cpu_gpu_dpcpp
+ENV MANPATH=/opt/intel/oneapi/mpi/latest/share/man:/opt/intel/oneapi/mpi/latest/share/man:/opt/intel/oneapi/compiler/latest/share/man
+ENV CMAKE_PREFIX_PATH=/opt/intel/oneapi/mkl/latest/lib/cmake:/opt/intel/oneapi/compiler/latest
+ENV CMPLR_ROOT=/opt/intel/oneapi/compiler/latest
+ENV LIBRARY_PATH=/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mkl/latest/lib/:/opt/intel/oneapi/compiler/latest/lib
+ENV OCL_ICD_FILENAMES=libintelocl_emu.so:libalteracl.so:/opt/intel/oneapi/compiler/latest/lib/libintelocl.so
+ENV CLASSPATH=/opt/intel/oneapi/mpi/latest/share/java/mpi.jar:/opt/intel/oneapi/mpi/latest/share/java/mpi.jar
+ENV LD_LIBRARY_PATH=/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/mkl/latest/lib:/opt/intel/oneapi/compiler/latest/opt/compiler/lib:/opt/intel/oneapi/compiler/latest/lib:/opt/intel/oneapi/lib:/opt/intel/oneapi/lib/intel64:
+ENV MKLROOT=/opt/intel/oneapi/mkl/latest
+ENV NLSPATH=/opt/intel/oneapi/mkl/latest/share/locale/%l_%t/%N:/opt/intel/oneapi/compiler/latest/lib/locale/%l_%t/%N
+ENV PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mpi/latest/bin:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mkl/latest/bin/:/opt/intel/oneapi/compiler/latest/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+ENV CPATH=/opt/intel/oneapi/mpi/latest/include:/opt/intel/oneapi/ccl/latest/include:/opt/intel/oneapi/mkl/latest/include
+ENV CCL_ZE_IPC_EXCHANGE=sockets
+
+
+RUN pip uninstall -y torch && cd pytorch && git submodule update --init --recursive && python setup.py install
+RUN pip uninstall -y intel-extension-for-pytorch && cd intel-extension-for-pytorch && git submodule update --init --recursive && USE_AOT_DEVLIST='pvc' BUILD_SEPARATE_OPS=ON BUILD_WITH_CPU=ON USE_XETLA=ON python setup.py install
+
+# Install benchmarker
+COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
+# Install router
+COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
+# Install launcher
+COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
+
+# Final image
+FROM base
+
+ENTRYPOINT ["text-generation-launcher"]
+CMD ["--json-output"]
diff --git a/launcher/src/env_runtime.rs b/launcher/src/env_runtime.rs
@@ -7,14 +7,17 @@ pub(crate) struct Env {
     git_sha: &'static str,
     docker_label: &'static str,
     nvidia_env: String,
+    xpu_env: String,
 }
 
 impl Env {
     pub fn new() -> Self {
         let nvidia_env = nvidia_smi();
+        let xpu_env = xpu_smi();
 
         Self {
             nvidia_env: nvidia_env.unwrap_or("N/A".to_string()),
+            xpu_env: xpu_env.unwrap_or("N/A".to_string()),
             cargo_target: env!("VERGEN_CARGO_TARGET_TRIPLE"),
             cargo_version: env!("VERGEN_RUSTC_SEMVER"),
             git_sha: option_env!("VERGEN_GIT_SHA").unwrap_or("N/A"),
@@ -31,7 +34,8 @@ impl fmt::Display for Env {
         writeln!(f, "Cargo version: {}", self.cargo_version)?;
         writeln!(f, "Commit sha: {}", self.git_sha)?;
         writeln!(f, "Docker label: {}", self.docker_label)?;
-        write!(f, "nvidia-smi:\n{}", self.nvidia_env)?;
+        writeln!(f, "nvidia-smi:\n{}", self.nvidia_env)?;
+        write!(f, "xpu-smi:\n{}", self.xpu_env)?;
 
         Ok(())
     }
@@ -43,3 +47,10 @@ fn nvidia_smi() -> Option<String> {
     let output = nvidia_smi.replace('\n', "\n   ");
     Some(output.trim().to_string())
 }
+
+fn xpu_smi() -> Option<String> {
+    let output = Command::new("xpu-smi").arg("discovery").output().ok()?;
+    let xpu_smi = String::from_utf8(output.stdout).ok()?;
+    let output = xpu_smi.replace('\n', "\n   ");
+    Some(output.trim().to_string())
+}
diff --git a/server/text_generation_server/models/cache_manager.py b/server/text_generation_server/models/cache_manager.py
@@ -2,6 +2,7 @@
 import torch
 
 from typing import Optional, List, Tuple
+from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
 
 BLOCK_SIZE: int = 16
 # Will be set in warmup
@@ -24,7 +25,10 @@ def __init__(
         self.repeat_slots = repeat_slots
 
         element_size = torch.tensor([], dtype=dtype).element_size()
-        x = self.block_size // element_size
+        if IS_XPU_SYSTEM:
+            x = 1
+        else:
+            x = self.block_size // element_size
 
         self.kv_cache = [
             (

diff --git a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py
@@ -21,8 +21,10 @@
 from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple, Any
 from loguru import logger
+from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
 
-from vllm.model_executor.layers.fused_moe import fused_moe
+if not IS_XPU_SYSTEM:
+    from vllm.model_executor.layers.fused_moe import fused_moe
 from text_generation_server.utils import paged_attention, flash_attn
 from text_generation_server.utils.layers import (
     FastLinear,

diff --git a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
@@ -24,7 +24,10 @@
 import numpy as np
 
 from torch import nn
-from vllm.model_executor.layers.fused_moe import fused_moe
+from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
+
+if not IS_XPU_SYSTEM:
+    from vllm.model_executor.layers.fused_moe import fused_moe
 from transformers.activations import ACT2FN
 from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple

diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
@@ -33,7 +33,7 @@
 from text_generation_server.utils.dist import MEMORY_FRACTION
 
 tracer = trace.get_tracer(__name__)
-
+from text_generation_server.utils.import_utils import IS_CUDA_SYSTEM, IS_ROCM_SYSTEM, IS_XPU_SYSTEM
 
 @dataclass
 class FlashCausalLMBatch(Batch):
@@ -752,7 +752,10 @@ def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int):
 
     def warmup(self, batch: FlashCausalLMBatch):
         # The warmup batch is the biggest batch we could ever receive
-        torch.cuda.empty_cache()
+        if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM:
+            torch.cuda.empty_cache()
+        elif IS_XPU_SYSTEM:
+            torch.xpu.empty_cache()
         try:
             cache_manager = set_cache_manager(
                 batch.blocks,
@@ -772,20 +775,29 @@ def warmup(self, batch: FlashCausalLMBatch):
                 f"You need to decrease `--max-batch-prefill-tokens`"
             ) from e
 
-        torch.cuda.synchronize(self.device)
+        if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM:
+            torch.cuda.synchronize(self.device)
+        elif IS_XPU_SYSTEM:
+            torch.xpu.synchronize(self.device)
 
         # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm)
         # Calculate the number of blocks that can be allocated with the free memory
         dtype_size = torch.tensor([], dtype=self.dtype).element_size()
         cache_block_size = BLOCK_SIZE * self.num_kv_heads * self.head_size
         total_cache_size = self.num_layers * cache_block_size * 2 * dtype_size
 
-        total_free_memory, _ = torch.cuda.mem_get_info(self.device)
-        total_gpu_memory = torch.cuda.get_device_properties(self.device).total_memory
+        if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM:
+            total_free_memory, _ = torch.cuda.mem_get_info(self.device)
+            total_gpu_memory = torch.cuda.get_device_properties(self.device).total_memory
 
-        free_memory = max(
-            0, total_free_memory - (1 - MEMORY_FRACTION) * total_gpu_memory
-        )
+            free_memory = max(
+                0, total_free_memory - (1 - MEMORY_FRACTION) * total_gpu_memory
+            )
+        elif IS_XPU_SYSTEM:
+            total_gpu_memory = torch.xpu.get_device_properties(self.device).total_memory
+            free_memory = int(total_gpu_memory *0.5)
+        else:
+            raise NotImplementedError("FlashModel is only available on GPU")
 
         num_blocks = (
             # Leave 5% for some wiggle room

diff --git a/server/text_generation_server/models/flash_llama.py b/server/text_generation_server/models/flash_llama.py
@@ -18,6 +18,7 @@
 
 tracer = trace.get_tracer(__name__)
 
+from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
 
 class FlashLlama(FlashCausalLM):
     def __init__(
@@ -33,6 +34,9 @@ def __init__(
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = torch.float16 if dtype is None else dtype
+        elif IS_XPU_SYSTEM:
+            device = torch.device(f"xpu:{rank}")
+            dtype = torch.float16 if dtype is None else dtype
         else:
             raise NotImplementedError("FlashLlama is only available on GPU")
 

diff --git a/server/text_generation_server/models/flash_mistral.py b/server/text_generation_server/models/flash_mistral.py
@@ -33,8 +33,9 @@
 # Will be set in init
 SLIDING_WINDOW: Optional[int] = None
 SLIDING_WINDOW_BLOCKS: Optional[int] = None
+from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
 
-MEM_POOL = torch.cuda.graph_pool_handle()
+MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
 
 
 def set_sliding_window(sliding_window: int, sliding_window_blocks: int):
@@ -316,6 +317,9 @@ def __init__(
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = torch.float16 if dtype is None else dtype
+        elif IS_XPU_SYSTEM:
+            device = torch.device(f"xpu:{rank}")
+            dtype = torch.float16 if dtype is None else dtype
         else:
             raise NotImplementedError("FlashMistral is only available on GPU")