Enable text-generation with new API (#318)
* enable text-generation with NeuralChat API

Signed-off-by: changwangss <chang1.wang@intel.com>

* fix wrong typing and hide import

Signed-off-by: changwangss <chang1.wang@intel.com>

* improve import check

* rebase main

Signed-off-by: changwangss <chang1.wang@intel.com>

* remove the outdated code

Signed-off-by: changwangss <chang1.wang@intel.com>

* update order

* improve sqconfig and add ut

Signed-off-by: changwangss <chang1.wang@intel.com>

* refine woq

Signed-off-by: changwangss <chang1.wang@intel.com>

* fix mp name

Signed-off-by: changwangss <chang1.wang@intel.com>

* fix pylint

Signed-off-by: changwangss <chang1.wang@intel.com>

* fix import

Signed-off-by: changwangss <chang1.wang@intel.com>

* Fixed shape error for weight-only quantization op

Signed-off-by: Cheng, Penghui <penghui.cheng@intel.com>

* Fixed UT error for weight-only quantization

Signed-off-by: Cheng, Penghui <penghui.cheng@intel.com>

* improve the example

Signed-off-by: changwangss <chang1.wang@intel.com>

* Update README.md

* fix long line

Signed-off-by: changwangss <chang1.wang@intel.com>

* fix import

Signed-off-by: changwangss <chang1.wang@intel.com>

* Update README.md

* Update test_quantization.py

---------

Signed-off-by: changwangss <chang1.wang@intel.com>
Signed-off-by: Cheng, Penghui <penghui.cheng@intel.com>
Co-authored-by: Cheng, Penghui <penghui.cheng@intel.com>
changwangss and PenghuiCheng committed Sep 18, 2023
1 parent 2bbcf51 commit f4dc780
Showing 16 changed files with 877 additions and 600 deletions.
155 changes: 75 additions & 80 deletions examples/huggingface/pytorch/text-generation/quantization/README.md
@@ -6,106 +6,101 @@ We provide the inference benchmarking script `run_generation.py` for [EleutherAI

# Prerequisite
## 1. Create Environment
If you want to use PyTorch & Intel-extension-for-pytorch version 2.0.1, run:
```
pip install -r requirements.txt
```
If you want to use PyTorch & Intel-extension-for-pytorch version 2.1, the dependent packages are listed in the requirements; we recommend creating the environment with the following steps.

```bash
WORK_DIR=$PWD
# GCC 12.3 is required, please set it up first
# Create environment (conda recommended)
conda create -n llm python=3.9 -y
# install deps; try gcc/gxx 12.2 if 12.3 is not available from conda
conda install gcc=12.3 gxx=12.3 cxx-compiler -c conda-forge -y
conda install cmake ninja mkl mkl-include -y
conda install gperftools -c conda-forge -y

# Install PyTorch
python -m pip install https://download.pytorch.org/whl/nightly/cpu/torch-2.1.0.dev20230711%2Bcpu-cp39-cp39-linux_x86_64.whl

# Install IPEX with the semi-compiler; requires gcc 12.3 or 12.2
rm -rf llvm-project && mkdir llvm-project && cd llvm-project
wget https://github.com/llvm/llvm-project/releases/download/llvmorg-16.0.6/cmake-16.0.6.src.tar.xz
wget https://github.com/llvm/llvm-project/releases/download/llvmorg-16.0.6/llvm-16.0.6.src.tar.xz
tar -xf cmake-16.0.6.src.tar.xz && mv cmake-16.0.6.src cmake
tar -xf llvm-16.0.6.src.tar.xz && mv llvm-16.0.6.src llvm
mkdir build && cd build
cmake ../llvm -DCMAKE_INSTALL_PREFIX=${PWD}/_install/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_ENABLE_TERMINFO=OFF -DLLVM_INCLUDE_BENCHMARKS=OFF -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=0"
make install -j$(nproc)
ln -s ${PWD}/_install/llvm/bin/llvm-config ${CONDA_PREFIX}/bin/llvm-config-13
cd ../../

git clone --branch llm_feature_branch https://github.com/intel/intel-extension-for-pytorch.git
cd intel-extension-for-pytorch
git submodule sync && git submodule update --init --recursive
export DNNL_GRAPH_BUILD_COMPILER_BACKEND=1
export CXXFLAGS="${CXXFLAGS} -D__STDC_FORMAT_MACROS"
python setup.py install
cd ../

# disable the semi-compiler to avoid accuracy regression for mpt and neural-chat-v1-1 models; other models don't need it.
export _DNNL_DISABLE_COMPILER_BACKEND=1

# Install neural-compressor
git clone https://github.com/intel/neural-compressor.git
cd neural-compressor
conda activate llm
bash build_env.sh
git clone https://github.com/intel/intel-extension-for-transformers.git
cd intel-extension-for-transformers
pip install -r requirements.txt
python setup.py install
```
> Note:
> Disable the semi-compiler to avoid accuracy regression for mpt and neural-chat-v1-1 models; other models don't need it.
> `export _DNNL_DISABLE_COMPILER_BACKEND=1`
```bash
# Install lm_eval
pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@83dbfbf6070324f3e5872f63e49d49ff7ef4c9b3
# Install other deps
pip install transformers optimum-intel cpuid accelerate datasets sentencepiece protobuf==3.20.3
```
> Note: If an `ImportError: /lib64/libstdc++.so.6: version ``GLIBCXX_3.4.29`` not found` error is raised when importing intel-extension-for-pytorch, it is due to the newer gcc library requirement; find the correct library version as follows.
> ```bash
> find $CONDA_PREFIX | grep libstdc++.so.6
> export LD_PRELOAD=<the path of libstdc++.so.6>:${LD_PRELOAD}
> ```

We use the GPT-J definition script [modeling_gptj.py](https://github.com/intel/intel-extension-for-transformers/blob/main/intel_extension_for_transformers/transformers/modeling/gptj/modeling_gptj.py) in `run_generation.py`. The following small change is needed to trace the model successfully.
```diff
# Line 602 in modeling_gptj.py on transformers 4.28.1
- position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
+ position_ids = torch.arange(past_length, torch.tensor(input_shape[-1]) + torch.tensor(past_length), dtype=torch.long, device=device)
```
The changes for `llama` series models in `modeling_llama.py`, `dolly_v2_3b` series models in `modeling_gpt_neox.py`, `bloom` series models in `modeling_bloom.py`, and `opt` series models in `modeling_opt.py` are similar to the above.
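For orientation only (this snippet is not part of the commit), the patched definition can be loaded from the package path linked above and traced in the usual way; the exported class name, the `torchscript=True` flag, and the prompt are assumptions for this sketch.
```python
# Hypothetical sketch: load GPT-J through the patched ITREX definition so that
# torch.jit.trace sees the tensor-based position_ids construction from the diff above.
import torch
from transformers import AutoTokenizer
# Module path follows the file linked above; the exported class name is an assumption.
from intel_extension_for_transformers.transformers.modeling.gptj.modeling_gptj import GPTJForCausalLM

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6b")
model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6b", torchscript=True)
model.eval()

inputs = tokenizer("The weather today is", return_tensors="pt")
with torch.no_grad():
    # strict=False tolerates the tuple/cache outputs of the causal-LM forward
    traced = torch.jit.trace(model, (inputs["input_ids"],), strict=False)
```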
# Run
We support compression technologies such as `MixedPrecision`, `SmoothQuant`, and `WeightOnlyQuant` with the `RTN/AWQ/TEQ/GPTQ` algorithms; `BitsAndBytes`-based transformers also work. The following commands show how to use them.
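For a quick picture of the Python API behind these command-line flags, here is a minimal sketch, assuming `intel_extension_for_transformers.transformers` exposes config classes along these lines; the class and argument names are assumptions and should be checked against the package.
```python
# Hypothetical sketch of the quantization API behind run_generation.py's flags.
# Class names and arguments are assumptions; check the package for the exact API.
from transformers import AutoTokenizer
from intel_extension_for_transformers.transformers import (
    AutoModelForCausalLM,   # drop-in replacement that accepts quantization_config
    SmoothQuantConfig,      # corresponds to --sq / --alpha
    WeightOnlyQuantConfig,  # corresponds to --woq
)

model_name = "EleutherAI/gpt-j-6b"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# SmoothQuant (static int8), roughly: --sq --alpha 1.0 --int8 (calibration details omitted)
sq_model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=SmoothQuantConfig(alpha=1.0)
)

# Weight-only quantization (e.g. RTN), roughly: --woq
woq_model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=WeightOnlyQuantConfig()
)

inputs = tokenizer("Once upon a time", return_tensors="pt")
print(tokenizer.decode(woq_model.generate(**inputs, max_new_tokens=32)[0]))
```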
## 1. Quantization
``` bash
# --int8 is used for int8 only.
# --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
python run_generation.py \
--model EleutherAI/gpt-j-6b \
--quantize \
--sq \
--alpha 1.0 \
--int8_bf16_mixed \
--ipex
```
## 2. Performance
```bash
# --int8 is used for int8 only.
# --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
export KMP_BLOCKTIME=1
export KMP_SETTINGS=1
export KMP_AFFINITY=granularity=fine,compact,1,0
export LD_PRELOAD=${CONDA_PREFIX}/lib/libiomp5.so
export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
# fp32
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
--model EleutherAI/gpt-j-6b \
--benchmark
# mixedprecision
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
--model EleutherAI/gpt-j-6b \
--mixed_precision \
--benchmark
# smoothquant
# [alternative] --int8 is used for int8 only, --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
--model EleutherAI/gpt-j-6b \
--sq \
--alpha 1.0 \
--int8 \
--benchmark
# weightonlyquant
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
--model EleutherAI/gpt-j-6b \
--woq \
--benchmark
# bitsandbytes
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
--model EleutherAI/gpt-j-6b \
--bitsandbytes \
--benchmark
```
## 3. Accuracy
```bash
# --int8 is used for int8 only.
# --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
# fp32
python run_generation.py \
--model EleutherAI/gpt-j-6b \
--accuracy \
--tasks "lambada_openai"
# mixedprecision
python run_generation.py \
--model EleutherAI/gpt-j-6b \
--mixed_precision \
--accuracy \
--tasks "lambada_openai"
# smoothquant
# [alternative] --int8 is used for int8 only, --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
python run_generation.py \
--model EleutherAI/gpt-j-6b \
--sq \
--alpha 1.0 \
--int8 \
--accuracy \
--tasks "lambada_openai"
# weightonlyquant
python run_generation.py \
--model EleutherAI/gpt-j-6b \
--woq \
--accuracy \
--tasks "lambada_openai"
# bitsandbytes
python run_generation.py \
--model EleutherAI/gpt-j-6b \
--bitsandbytes \
--accuracy \
--tasks "lambada_openai"
```
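The `--accuracy` runs above go through the pinned lm-evaluation-harness; a hedged sketch of calling it directly is shown below (the `simple_evaluate` interface at the pinned commit may differ slightly, so treat the argument names as assumptions).
```python
# Hedged sketch: evaluate lambada_openai directly with lm-evaluation-harness.
from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="hf-causal",
    model_args="pretrained=EleutherAI/gpt-j-6b",
    tasks=["lambada_openai"],
    batch_size=8,
)
print(results["results"]["lambada_openai"])  # per-task accuracy and perplexity
```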
118 changes: 118 additions & 0 deletions examples/huggingface/pytorch/text-generation/quantization/build_env.sh
@@ -0,0 +1,118 @@
#!/bin/bash
set -x
set -e

VER_LLVM="llvmorg-16.0.6"
VER_IPEX="7256d0848ba81bb802dd33fca0e33049a751db58"

# Check existence of required Linux commands
for CMD in conda git nproc make; do
    command -v ${CMD} || (echo "Error: Command \"${CMD}\" not found." ; exit 4)
done

MAX_JOBS_VAR=$(nproc)
if [ ! -z "${MAX_JOBS}" ]; then
    MAX_JOBS_VAR=${MAX_JOBS}
fi

# Save current directory path
BASEFOLDER=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd ${BASEFOLDER}
# Checkout individual components
if [ ! -d llvm-project ]; then
    git clone https://github.com/llvm/llvm-project.git
fi
if [ ! -d intel-extension-for-pytorch ]; then
    git clone https://github.com/intel/intel-extension-for-pytorch.git
fi

# Checkout required branch/commit and update submodules
cd llvm-project
if [ ! -z ${VER_LLVM} ]; then
    git checkout ${VER_LLVM}
fi
git submodule sync
git submodule update --init --recursive
cd ..
cd intel-extension-for-pytorch
if [ ! -z ${VER_IPEX} ]; then
    git checkout ${VER_IPEX}
fi
git submodule sync
git submodule update --init --recursive
cd ..

# Install dependencies
conda install -y gcc==12.3 gxx==12.3 cxx-compiler -c conda-forge
conda update -y sysroot_linux-64
python -m pip install cmake
python -m pip install https://download.pytorch.org/whl/nightly/cpu/torch-2.1.0.dev20230711%2Bcpu-cp39-cp39-linux_x86_64.whl
ABI=$(python -c "import torch; print(int(torch._C._GLIBCXX_USE_CXX11_ABI))")

# Compile individual component
export CC=${CONDA_PREFIX}/bin/gcc
export CXX=${CONDA_PREFIX}/bin/g++
export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}

# LLVM
cd llvm-project
LLVM_ROOT="$(pwd)/release"
if [ -d ${LLVM_ROOT} ]; then
rm -rf ${LLVM_ROOT}
fi
if [ -d build ]; then
rm -rf build
fi
mkdir build
cd build
echo "***************************** cmake *****************************" > ../build.log
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=${ABI}" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_ENABLE_TERMINFO=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_BENCHMARKS=OFF ../llvm 2>&1 | tee -a ../build.log
echo "***************************** build *****************************" >> ../build.log
cmake --build . -j ${MAX_JOBS_VAR} 2>&1 | tee -a ../build.log
echo "**************************** install ****************************" >> ../build.log
cmake -DCMAKE_INSTALL_PREFIX=${LLVM_ROOT} -P cmake_install.cmake 2>&1 | tee -a ../build.log
#xargs rm -rf < install_manifest.txt
cd ..
rm -rf build
ln -s ${LLVM_ROOT}/bin/llvm-config ${LLVM_ROOT}/bin/llvm-config-13
export PATH=${LLVM_ROOT}/bin:$PATH
export LD_LIBRARY_PATH=${LLVM_ROOT}/lib:$LD_LIBRARY_PATH
cd ..
# Intel® Extension for PyTorch*
cd intel-extension-for-pytorch
python -m pip install -r requirements.txt
export LLVM_DIR=${LLVM_ROOT}/lib/cmake/llvm
export DNNL_GRAPH_BUILD_COMPILER_BACKEND=1
CXXFLAGS_BK=${CXXFLAGS}
export CXXFLAGS="${CXXFLAGS} -D__STDC_FORMAT_MACROS"
python setup.py clean
python setup.py bdist_wheel 2>&1 | tee build.log
export CXXFLAGS=${CXXFLAGS_BK}
unset DNNL_GRAPH_BUILD_COMPILER_BACKEND
unset LLVM_DIR
python -m pip install --force-reinstall dist/*.whl
cd ..

# Sanity Test
set +x
export LD_PRELOAD=${CONDA_PREFIX}/lib/libstdc++.so
echo "Note: Should you experience \"version \`GLIBCXX_N.N.NN' not found\" error, run command \"export LD_PRELOAD=${CONDA_PREFIX}/lib/libstdc++.so\" and try again."
python -c "import torch; import intel_extension_for_pytorch as ipex; print(f'torch_cxx11_abi: {torch._C._GLIBCXX_USE_CXX11_ABI}'); print(f'torch_version: {torch.__version__}'); print(f'ipex_version: {ipex.__version__}');"
# Install neural-compressor
git clone https://github.com/intel/neural-compressor.git
cd neural-compressor
pip install -r requirements.txt
python setup.py install
cd ..

# Install intel-extension-for-pytorch
git checkout -b int8_llama2
pip install -r requirements.txt
python setup.py install
cd ..

# Install lm_eval
pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@83dbfbf6070324f3e5872f63e49d49ff7ef4c9b3
# Install other deps
pip install transformers optimum-intel cpuid accelerate datasets sentencepiece protobuf==3.20.3

@@ -1,5 +1,6 @@
accelerate
datasets >= 2.0
peft
protobuf
sentencepiece != 0.1.92
torch >= 1.10.0
