Commit
merge from main
changqi1 committed Jun 25, 2024
2 parents 88a4485 + 0ce383b commit a149a94
Showing 80 changed files with 2,974 additions and 835 deletions.
42 changes: 42 additions & 0 deletions CHANGELOG.md
@@ -1,4 +1,46 @@
# CHANGELOG
# [Version v1.7.2](https://github.com/intel/xFasterTransformer/releases/tag/v1.7.2)
v1.7.2 - Continuous batching feature supports Qwen 1.0 & hybrid data types.

## Functionality
- Add continuous batching support for Qwen 1.0 models.
- Enable hybrid data types for the continuous batching feature, including `BF16_FP16, BF16_INT8, BF16_W8A8, BF16_INT4, BF16_NF4, W8A8_INT8, W8A8_INT4, W8A8_NF4`.
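  As a minimal sketch of how one of these hybrid types might be selected through the Python API (the `dtype` spelling, the placeholder model path, and the exact accepted values are assumptions to verify against the documentation):

  ```bash
  # Hedged sketch: load an already-converted xFT model with a hybrid data type.
  python - <<'EOF'
  import xfastertransformer

  model = xfastertransformer.AutoModel.from_pretrained(
      "/path/to/xft-model",   # placeholder: directory of a converted xFT model
      dtype="bf16_int8",      # one of the hybrid types listed above (spelling assumed)
  )
  EOF
  ```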

## BUG fix
- Fixed the conversion failure in Baichuan1 models.

# [Version v1.7.1](https://github.com/intel/xFasterTransformer/releases/tag/v1.7.1)
v1.7.1 - Continuous batching feature supports ChatGLM2/3.

## Functionality
- Add continuous batching support for ChatGLM2/3 models.
- Qwen2Convert supports Qwen2 models quantized by GPTQ, such as GPTQ-Int8 and GPTQ-Int4, via the param `from_quantized_model="gptq"`.
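  A hedged sketch of the conversion call (`from_quantized_model` is the param named above; the positional `convert(input_dir, output_dir, ...)` layout is assumed to mirror the other `*Convert` helpers):

  ```bash
  python - <<'EOF'
  import xfastertransformer

  # Placeholder paths; the positional signature is assumed to match the other helpers.
  xfastertransformer.Qwen2Convert().convert(
      "/path/to/qwen2-gptq",        # HuggingFace GPTQ-Int8/Int4 checkpoint
      "/path/to/xft-qwen2",         # output directory for the converted xFT model
      from_quantized_model="gptq",  # param named in the changelog entry above
  )
  EOF
  ```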

## BUG fix
- Fixed the segmentation fault error when running with more than 2 ranks in vllm-xft serving.

# [Version v1.7.0](https://github.com/intel/xFasterTransformer/releases/tag/v1.7.0)
v1.7.0 - Continuous batching feature supported.

## Functionality
- Refactor framework to support continuous batching feature. `vllm-xft`, a fork of vllm, integrates the xFasterTransformer backend and maintains compatibility with most of the official vLLM's features.
- Remove the FP32 data type option for KV Cache.
- Add `get_env()` Python API to get the recommended LD_PRELOAD settings (a usage sketch follows this list).
- Add GPU build option for Intel Arc GPU series.
- Expose the interface of the LLaMA model, including Attention and decoder.
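  The `get_env()` helper is the same one used by the shell examples later in this diff; a one-line usage sketch:

  ```bash
  # Export the environment settings (e.g. LD_PRELOAD) recommended by xFT in one line.
  export $(python -c 'import xfastertransformer as xft; print(xft.get_env())')
  ```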

## Performance
- Update xDNN to release `v1.5.1`.
- Baichuan series models support a full FP16 pipeline to improve performance.
- More FP16 data type kernels added, including MHA, MLP, YARN rotary_embedding, rmsnorm and rope.
- Kernel implementation of crossAttnByHead.

## Dependency
- Bump `torch` to `2.3.0`.

## BUG fix
- Fixed the segmentation fault error when running with more than 4 ranks.
- Fixed core dump and hang bugs when running across nodes.

# [Version v1.6.0](https://github.com/intel/xFasterTransformer/releases/tag/v1.6.0)
v1.6.0 - Llama3 and Qwen2 series models supported.
18 changes: 10 additions & 8 deletions CMakeLists.txt
@@ -20,7 +20,7 @@ project(xfastertransformer LANGUAGES C CXX)
option(WITH_GPU "Build with GPU" OFF)
if(WITH_GPU)
message(STATUS "Notice: Building with GPU.")
add_definitions(-DGPU=true)
add_definitions(-DXFT_GPU=true)
# Get compiler version
execute_process(COMMAND ${CMAKE_CXX_COMPILER} --version
OUTPUT_VARIABLE ICPX_VERSION
@@ -35,10 +35,6 @@ else()
message(STATUS "Notice: GCC version: ${GCC_VERSION}")
endif()

if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512bw -mavx512vl -fPIC")
if(WITH_GPU)
@@ -73,10 +69,16 @@ if(GCC_VERSION VERSION_GREATER_EQUAL "10.1")
endif()
endif()

if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()

if(CMAKE_BUILD_TYPE MATCHES "Debug")
message(STATUS "Notice: Using Debug mode.")
set(CMAKE_C_FLAGS "-O0 -g")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -g")
add_definitions(-DXFT_DEBUG=true)
add_definitions(-DSTEP_BY_STEP_ATTN=true)
else()
message(STATUS "Notice: Using Release mode.")
set(CMAKE_C_FLAGS "-O2")
@@ -185,14 +187,14 @@ else()
add_definitions(-DAVX512_FP16_WEIGHT_ONLY_INT4=true)
add_definitions(-DAVX512_FP32_WEIGHT_ONLY_NF4=true)
# add_definitions(-DAVX512_FP16_WEIGHT_ONLY_NF4=true)
# Enable AMX_FP16 optimization
# add_definitions(-DAMX_FP16_WEIGHT_ONLY_FP16=true)
endif()

# add_definitions(-DDEBUG=true)
# add_definitions(-DSTEP_BY_STEP_ATTN=true)
add_definitions(-DUSE_SHM=true)
option(XFT_BUILD_TESTS "Build xfastertransformer unit tests" OFF)
if(XFT_BUILD_TESTS)
add_definitions(-DUNDEBUG=true)
add_definitions(-DUNDEBUG=true)
endif()

# timeline event
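For reference, a minimal configure/build sketch exercising the options touched in the CMakeLists.txt changes above (`WITH_GPU`, `CMAKE_BUILD_TYPE`, `XFT_BUILD_TESTS`); the out-of-tree layout and anything not named in the diff are assumptions:

```bash
# Minimal build sketch; the three -D options are defined in the CMakeLists.txt above.
mkdir -p build && cd build
cmake .. -DWITH_GPU=OFF -DCMAKE_BUILD_TYPE=Release -DXFT_BUILD_TESTS=OFF
make -j "$(nproc)"
```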
84 changes: 74 additions & 10 deletions README.md
@@ -20,6 +20,7 @@ xFasterTransformer is an exceptionally optimized solution for large language mod
- [Built from source](#built-from-source)
- [Prepare Environment](#prepare-environment)
- [Manually](#manually)
- [Install dependent libraries](#install-dependent-libraries)
- [How to build](#how-to-build)
- [Models Preparation](#models-preparation)
- [API usage](#api-usage)
@@ -34,6 +35,11 @@ xFasterTransformer is an exceptionally optimized solution for large language mod
- [C++](#c)
- [Web Demo](#web-demo)
- [Serving](#serving)
- [vLLM](#vllm)
- [Install](#install)
- [OpenAI Compatible Server](#openai-compatible-server)
- [FastChat](#fastchat)
- [MLServer](#mlserver)
- [Benchmark](#benchmark)
- [Support](#support)
- [Q\&A](#qa)
@@ -55,7 +61,8 @@ xFasterTransformer provides a series of APIs, both of C++ and Python, for end us
| Llama | ✔ | ✔ | ✔ |
| Llama2 | ✔ | ✔ | ✔ |
| Llama3 | ✔ | ✔ | ✔ |
| Baichuan | ✔ | ✔ | ✔ |
| Baichuan | ✔ | ✔ | ✔ |
| Baichuan2 | ✔ | ✔ | ✔ |
| QWen | ✔ | ✔ | ✔ |
| QWen2 | ✔ | ✔ | ✔ |
| SecLLM(YaRN-Llama) | ✔ | ✔ | ✔ |
@@ -114,12 +121,12 @@ docker run -it \
### Built from source
#### Prepare Environment
##### Manually
- [PyTorch](https://pytorch.org/get-started/locally/) v2.0 (When using the PyTorch API, it's required, but it's not needed when using the C++ API.)
- [PyTorch](https://pytorch.org/get-started/locally/) v2.3 (When using the PyTorch API, it's required, but it's not needed when using the C++ API.)
```bash
pip install torch --index-url https://download.pytorch.org/whl/cpu
```

- For GPU, xFT needs ABI=1 from [torch==2.0.1+cpu.cxx11.abi](https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.0.1%2Bcpu.cxx11.abi-cp38-cp38-linux_x86_64.whl#sha256=fbe35a5c60aef0c4b5463caab10ba905bdfa07d6d16b7be5d510225c966a0b46) in [torch-whl-list](https://download.pytorch.org/whl/torch/) due to DPC++ need ABI=1.
- For GPU, xFT needs ABI=1 from [torch==2.3.0+cpu.cxx11.abi](https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.3.0%2Bcpu.cxx11.abi-cp38-cp38-linux_x86_64.whl#sha256=c34512c3e07efe9b7fb5c3a918fef1a7c6eb8969c6b2eea92ee5c16a0583fe12) in [torch-whl-list](https://download.pytorch.org/whl/torch/) due to DPC++ need ABI=1.
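  For example, the ABI=1 wheel linked above can be installed directly from its URL (this is the cp38 build; pick the matching wheel for other Python versions from the torch-whl-list):

  ```bash
  # Hedged sketch: install the cxx11-abi CPU wheel referenced above for GPU builds.
  pip install "https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.3.0%2Bcpu.cxx11.abi-cp38-cp38-linux_x86_64.whl"
  ```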

##### Install dependent libraries

@@ -167,6 +174,7 @@ xFasterTransformer supports a different model format from Huggingface, but it's
- ChatGLM3Convert
- OPTConvert
- BaichuanConvert
- Baichuan2Convert
- QwenConvert
- Qwen2Convert
- DeepseekConvert
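A hedged one-line conversion sketch using one of the helpers above (paths are placeholders; the `convert(input_dir, output_dir)` signature is assumed to match the project's documented usage):

```bash
python -c 'import xfastertransformer as xft; xft.LlamaConvert().convert("/path/to/hf-llama", "/path/to/xft-llama")'
```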
@@ -229,7 +237,10 @@ std::cout << std::endl;
```
## How to run
Preloading `libiomp5.so` is recommended for better performance. The `libiomp5.so` file will be in the `3rdparty/mklml/lib` directory after building xFasterTransformer successfully.
Preloading `libiomp5.so` is recommended for better performance.
- ***[Recommended]*** Run `export $(python -c 'import xfastertransformer as xft; print(xft.get_env())')` if xfastertransformer's python wheel package is installed.
- If building from source code, the `libiomp5.so` file will be in the `3rdparty/mkl/lib` directory after xFasterTransformer is built successfully.
### Single rank
xFasterTransformer will automatically check the MPI environment, or you can use the `SINGLE_INSTANCE=1` environment variable to forcefully deactivate MPI.
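For example, a single-rank run that forces MPI off might look like the following sketch (`demo.py` and its flags are placeholders for your own workload):

```bash
# Sketch: SINGLE_INSTANCE=1 forcefully deactivates MPI as described above.
export $(python -c 'import xfastertransformer as xft; print(xft.get_env())')
SINGLE_INSTANCE=1 python demo.py \
    --dtype=bf16 \
    --token_path=${TOKEN_PATH} \
    --model_path=${MODEL_PATH}
```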
@@ -254,7 +265,9 @@ Use MPI to run in multi-rank mode; please install oneCCL first.
- Here is an example of running locally.
```bash
OMP_NUM_THREADS=48 LD_PRELOAD=libiomp5.so mpirun \
# or export LD_PRELOAD=libiomp5.so manually
export $(python -c 'import xfastertransformer as xft; print(xft.get_env())')
OMP_NUM_THREADS=48 mpirun \
-n 1 numactl -N 0 -m 0 ${RUN_WORKLOAD} : \
-n 1 numactl -N 1 -m 1 ${RUN_WORKLOAD}
```
@@ -300,14 +313,65 @@ A web demo based on [Gradio](https://www.gradio.app/) is provided in the repo. Now s
- Run the script corresponding to the model. After the web server has started, open the output URL in the browser to use the demo. Please specify the model and tokenizer directory paths and the data type. `transformer`'s tokenizer is used to encode and decode text, so `${TOKEN_PATH}` means the huggingface model directory. This demo also supports multi-rank.
```bash
  # Preloading `libiomp5.so` is recommended for better performance.
  # The `libiomp5.so` file will be in the `3rdparty/mklml/lib` directory after building xFasterTransformer.
LD_PRELOAD=libiomp5.so python examples/web_demo/ChatGLM.py \
--dtype=bf16 \
--token_path=${TOKEN_PATH} \
--model_path=${MODEL_PATH}
  # or set LD_PRELOAD=libiomp5.so manually; `libiomp5.so` will be in the `3rdparty/mkl/lib` directory after building xFasterTransformer.
export $(python -c 'import xfastertransformer as xft; print(xft.get_env())')
python examples/web_demo/ChatGLM.py \
--dtype=bf16 \
--token_path=${TOKEN_PATH} \
--model_path=${MODEL_PATH}
```
## Serving
### vLLM
A fork of vLLM has been created to integrate the xFasterTransformer backend, maintaining compatibility with most of the official vLLM's features. Refer to [this link](serving/vllm-xft.md) for more details.
#### Install
```bash
pip install vllm-xft
```
***Notice: Please do not install both `vllm-xft` and `vllm` simultaneously in the environment. Although the package names are different, they will actually overwrite each other.***
#### OpenAI Compatible Server
***Notice: Preloading libiomp5.so is required!***
```bash
# Preload libiomp5.so with the following cmd, or set LD_PRELOAD=libiomp5.so manually
export $(python -c 'import xfastertransformer as xft; print(xft.get_env())')
python -m vllm.entrypoints.openai.api_server \
--model ${MODEL_PATH} \
--tokenizer ${TOKEN_PATH} \
--dtype bf16 \
--kv-cache-dtype fp16 \
--served-model-name xft \
--port 8000 \
--trust-remote-code
```
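Once the server is up, it can be queried with any OpenAI-compatible client; a minimal curl sketch against the settings above (port 8000, served model name `xft`):

```bash
# Minimal request sketch; the payload follows the OpenAI completions API and the
# model name matches --served-model-name above.
curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "xft", "prompt": "Hello, my name is", "max_tokens": 32}'
```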
For multi-rank mode, please use `python -m vllm.entrypoints.slave` as the slave and keep the slaves' params aligned with the master's.
```bash
# Preload libiomp5.so with the following cmd, or set LD_PRELOAD=libiomp5.so manually
export $(python -c 'import xfastertransformer as xft; print(xft.get_env())')
OMP_NUM_THREADS=48 mpirun \
-n 1 numactl --all -C 0-47 -m 0 \
python -m vllm.entrypoints.openai.api_server \
--model ${MODEL_PATH} \
--tokenizer ${TOKEN_PATH} \
--dtype bf16 \
--kv-cache-dtype fp16 \
--served-model-name xft \
--port 8000 \
--trust-remote-code \
: -n 1 numactl --all -C 48-95 -m 1 \
python -m vllm.entrypoints.slave \
--dtype bf16 \
--model ${MODEL_PATH} \
--kv-cache-dtype fp16
```
### FastChat
xFasterTransformer is an official inference backend of [FastChat](https://github.com/lm-sys/FastChat). Please refer to [xFasterTransformer in FastChat](https://github.com/lm-sys/FastChat/blob/main/docs/xFasterTransformer.md) and [FastChat's serving](https://github.com/lm-sys/FastChat/blob/main/docs/openai_api.md) for more details.
### MLServer
[An example serving of MLServer](serving/mlserver/README.md) is provided, which supports REST and gRPC interfaces and an adaptive batching feature to group inference requests together on the fly.
## [Benchmark](benchmark/README.md)