[CPP Graph] Asym model (#306)
airMeng committed Sep 15, 2023
1 parent fcd59ab commit 93ca550
Showing 4 changed files with 48 additions and 8 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/script/models/cpp_graph_inference.sh
@@ -44,7 +44,7 @@ function main() {
infer_cmd="./build/bin/run_gptj"
model_name="EleutherAI/gpt-j-6b"
input_model="/tf_dataset2/models/pytorch/gpt-j-6B"
precision_list=("q4_j_b128")
precision_list=("q4_j_b128" "q4_j_b128_asym")
elif [[ "${model}" == "starcoder-3b" ]]; then
convert_script="${working_dir}/scripts/convert_starcoder.py"
quant_script="./build/bin/quant_starcoder"
@@ -119,6 +119,8 @@ function main() {
${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --bits 4 --block_size 32 --scale_dtype fp32 --compute_type fp32 --alg sym
elif [[ ${precision} == "q4_j_b128" ]]; then
${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --bits 4 --block_size 128 --scale_dtype fp32 --compute_type fp32 --alg sym
elif [[ ${precision} == "q4_j_b128_asym" ]]; then
${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --bits 4 --block_size 128 --scale_dtype fp32 --compute_type fp32 --alg asym
elif [[ ${precision} == "q4_0" ]]; then
${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --bits 4 --block_size 32 --compute_type ggml --alg sym
elif [[ ${precision} == "q4_1" ]]; then
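Note: the new q4_j_b128_asym precision only changes the `--alg` flag passed to the quant binary (asym instead of sym, still 4-bit with block size 128 and fp32 scales). As a rough illustration of what symmetric vs asymmetric weight quantization means here, the sketch below is illustrative only, not the jblas kernels; the function names and the exact int4 ranges are assumptions.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Symmetric 4-bit: one fp32 scale per block, values mapped around zero into [-8, 7].
void quant_block_sym(const float* src, int8_t* dst, float& scale, int block_size) {
  float amax = 0.f;
  for (int i = 0; i < block_size; ++i) amax = std::max(amax, std::fabs(src[i]));
  scale = amax / 7.f;  // assumed mapping of the absolute max onto the int4 range
  const float inv = scale != 0.f ? 1.f / scale : 0.f;
  for (int i = 0; i < block_size; ++i)
    dst[i] = static_cast<int8_t>(std::lround(std::clamp(src[i] * inv, -8.f, 7.f)));
}

// Asymmetric 4-bit: a scale plus a zero point, so the block's [min, max] maps onto [0, 15].
void quant_block_asym(const float* src, int8_t* dst, float& scale, int8_t& zero_point, int block_size) {
  const auto mm = std::minmax_element(src, src + block_size);
  const float mn = *mm.first, mx = *mm.second;
  scale = (mx - mn) / 15.f;
  const float inv = scale != 0.f ? 1.f / scale : 0.f;
  zero_point = static_cast<int8_t>(std::lround(-mn * inv));
  for (int i = 0; i < block_size; ++i)
    dst[i] = static_cast<int8_t>(std::lround(std::clamp(src[i] * inv + zero_point, 0.f, 15.f)));
}
```

The zero point is what lets an asymmetric scheme spend the full 4-bit range on weight blocks whose values are not centered on zero, at the cost of storing one extra value per block.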
@@ -1280,11 +1280,11 @@ class WeightF4ScaleFp32 : public WeightS4ScaleFp32<_GemmCore_T, ISA_T, S4_CLIP>
public:
using Param = typename WeightS8ScaleFp32<_GemmCore_T, ISA_T>::Param;
using StorageWeight = StorageWeightF4ScaleFp32;
PackedWeight* createStorage(const int N, const int K, int blocksize, bool is_sym = true) override {
PackedWeight* createStorage(const int N, const int K, int blocksize) {
int KPad = utils::padto(K, _GemmCore_T::KTILE);
int NPad = utils::padto(N, _GemmCore_T::NTILE);
auto ptr = new StorageWeight(_GemmCore_T::TYPE, F4_T);
ptr->resize(NPad, KPad, blocksize <= 0 ? K : blocksize, is_sym);
ptr->resize(NPad, KPad, blocksize <= 0 ? K : blocksize);
return ptr;
}

@@ -1334,6 +1334,26 @@ class WeightF4ScaleFp32 : public WeightS4ScaleFp32<_GemmCore_T, ISA_T, S4_CLIP>
assert(false);
return JblasInvalidParam;
}
virtual void packQWeight(const int N, const int K, const int8_t* B, const int ldb, const float* scales,
PackedWeight* ptr) {
auto stor = dynamic_cast<StorageWeight*>(ptr);
if (stor) {
int rawnk_scale = utils::updiv(K, stor->mBlockSize);
int nk_scale = utils::updiv(stor->mKPad, stor->mBlockSize);
#pragma omp parallel for
for (int i = 0; i < nk_scale; i++) { // padding copy
if (i < rawnk_scale) {
std::memcpy(stor->mSPtr + i * stor->mNPad, scales + i * N, N * sizeof(scales[0]));
} else {
std::memset(stor->mSPtr + i * stor->mNPad, 0, stor->mNPad * sizeof(stor->mSPtr[0]));
}
}
utils::avector<int8_t> reorded(stor->mKPad * stor->mNPad);
WeightS8ScaleFp32<_GemmCore_T, ISA_T>::reorderWeight(N, K, B, ldb, reorded.data());
WeightS4ScaleFp32<_GemmCore_T, ISA_T, S4_CLIP>::compressWeight(stor->mNPad, stor->mKPad, reorded.data(),
stor->mNPad, stor->mWPtr);
}
}

protected:
virtual void quantRowBlock(const float* srcptr, int8_t* dstptr, int row, int col, int ld_src, int ld_dst,
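The packQWeight added above copies pre-quantized int8 weights and their per-block scales into the padded storage: the real scale rows (rawnk_scale of them) are copied, the rows that only exist because K was padded up to mKPad are zero-filled, and the int8 data is then reordered and compressed to 4-bit. A minimal standalone sketch of the scale-padding part, with illustrative names rather than the actual jblas storage layout:

```cpp
#include <cstring>

// Ceiling division, same role as utils::updiv in the hunk above.
inline int updiv(int a, int b) { return (a + b - 1) / b; }

// Copy N scales per real block row into an NPad-wide, nk_scale-tall padded buffer,
// zero-filling the rows introduced by padding K up to KPad.
void pad_block_scales(const float* scales, float* padded, int N, int K, int KPad, int NPad,
                      int block_size) {
  const int rawnk_scale = updiv(K, block_size);  // rows backed by real scales
  const int nk_scale = updiv(KPad, block_size);  // rows after K padding
  for (int i = 0; i < nk_scale; ++i) {
    if (i < rawnk_scale) {
      std::memcpy(padded + i * NPad, scales + i * N, N * sizeof(float));
    } else {
      std::memset(padded + i * NPad, 0, NPad * sizeof(float));
    }
  }
}
```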
@@ -23,6 +23,12 @@
#include <cstdio>
#endif

#if UINTPTR_MAX == 0xFFFFFFFF
#define NE_MEM_ALIGN 4
#else
#define NE_MEM_ALIGN 16
#endif

#include "core/ne_layers.h"
#include "models/model_utils/util.h"
#include "models/models.h"
@@ -493,9 +499,15 @@ struct model_model_loader {

void calc_sizes(size_t* ctx_size_p, size_t* mmapped_size_p) const {
*ctx_size_p = *mmapped_size_p = 0;
size_t size_needed = 0;
for (const model_load_tensor& lt : tensors_map.tensors) {
*ctx_size_p += sizeof(struct ne_tensor) + NE_OBJECT_SIZE;
*(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
if (lt.type == NE_TYPE_JBLAS) {
size_needed = lt.size;
} else {
size_needed = (lt.size + NE_MEM_ALIGN - 1) / NE_MEM_ALIGN * NE_MEM_ALIGN;
}
*(use_mmap ? mmapped_size_p : ctx_size_p) += size_needed;
}
}

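calc_sizes now rounds non-jblas tensor sizes up to the next NE_MEM_ALIGN boundary (4 bytes on 32-bit builds, 16 bytes otherwise) so the context size accounts for alignment padding, while jblas-packed tensors keep their exact size. A small self-contained sketch of the round-up expression, assuming the same formula as in the hunk above:

```cpp
#include <cassert>
#include <cstddef>

// Round size up to the next multiple of align, matching
// (lt.size + NE_MEM_ALIGN - 1) / NE_MEM_ALIGN * NE_MEM_ALIGN.
constexpr size_t align_up(size_t size, size_t align) {
  return (size + align - 1) / align * align;
}

int main() {
  // With NE_MEM_ALIGN == 16 (64-bit builds):
  assert(align_up(33, 16) == 48);
  assert(align_up(48, 16) == 48);  // already-aligned sizes are unchanged
  // With NE_MEM_ALIGN == 4 (32-bit builds):
  assert(align_up(33, 4) == 36);
  return 0;
}
```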
@@ -796,6 +796,9 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
if (params.bits == quant_bits::q4) {
if (params.scale_dtype == quant_sdtype::fp32) {
if (params.compute_type == quant_comp::int8) {
if (params.alg != quant_alg::sym) {
printf("Current not support asymmetric int8 computation, reset to symmetric\n");
}
if (params.block_size == -1) {
using Kernel = WeiS4ClipFp32PerN<GcCompInt8, JblasAVX512F>;
using KernelRef = WeiS4ClipFp32PerN<GcCompInt8, JblasNoSIMD>;
@@ -824,7 +827,7 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
using KernelRef = WeiS4ClipFp32<GcCompFp32, JblasNoSIMD>;
static Kernel kernel;
static Kernel kernelref;
packedw = kernel.createStorage(n, k, params.block_size);
packedw = kernel.createStorage(n, k, params.block_size, params.alg == quant_alg::sym);
if (cd->AVX512_FP16()) {
kernel.packTransposeWeight(n, k, f32ptr, k, packedw);
} else {
@@ -835,7 +838,7 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
using KernelRef = WeiS4ClipFp32<GcCompBf16, JblasNoSIMD>;
static Kernel kernel;
static Kernel kernelref;
packedw = kernel.createStorage(n, k, params.block_size);
packedw = kernel.createStorage(n, k, params.block_size, params.alg == quant_alg::sym);
if (cd->AMX_BF16()) {
kernel.packTransposeWeight(n, k, f32ptr, k, packedw);
} else {
@@ -848,6 +851,9 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
// TODO add 8bit quantization
if (params.scale_dtype == quant_sdtype::fp32) {
if (params.compute_type == quant_comp::int8) {
if (params.alg != quant_alg::sym) {
printf("Current not support asymmetric int8 computation, reset to symmetric\n");
}
if (params.block_size == -1) {
using Kernel = WeiS8Fp32PerN<GcCompInt8, JblasAVX512F>;
using KernelRef = WeiS8Fp32PerN<GcCompInt8, JblasNoSIMD>;
@@ -876,7 +882,7 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
using KernelRef = WeiS8Fp32<GcCompFp32, JblasNoSIMD>;
static Kernel kernel;
static Kernel kernelref;
packedw = kernel.createStorage(n, k, params.block_size);
packedw = kernel.createStorage(n, k, params.block_size, params.alg == quant_alg::sym);
if (cd->AVX512_FP16()) {
kernel.packTransposeWeight(n, k, f32ptr, k, packedw);
} else {
@@ -887,7 +893,7 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
using KernelRef = WeiS8Fp32<GcCompBf16, JblasNoSIMD>;
static Kernel kernel;
static Kernel kernelref;
packedw = kernel.createStorage(n, k, params.block_size);
packedw = kernel.createStorage(n, k, params.block_size, params.alg == quant_alg::sym);
if (cd->AMX_BF16()) {
kernel.packTransposeWeight(n, k, f32ptr, k, packedw);
} else {
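Across these hunks, the fp32 and bf16 compute paths forward `params.alg == quant_alg::sym` into createStorage, while the int8 compute paths only print a warning and stay symmetric. A hypothetical helper that captures this policy; the enum member lists and the function itself are assumptions, not part of the library:

```cpp
#include <cstdio>

// Illustrative mirrors of the params used above; member lists are assumed.
enum class quant_alg { sym, asym };
enum class quant_comp { int8, fp32, bf16 };

// Decide the is_sym flag passed to createStorage for a given request.
bool resolve_is_sym(quant_alg alg, quant_comp comp) {
  if (comp == quant_comp::int8 && alg != quant_alg::sym) {
    std::printf("asymmetric int8 computation is not supported, falling back to symmetric\n");
    return true;  // int8 compute kernels stay symmetric
  }
  return alg == quant_alg::sym;
}

// Usage (illustrative):
//   bool is_sym = resolve_is_sym(params.alg, params.compute_type);
//   packedw = kernel.createStorage(n, k, params.block_size, is_sym);
```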
