[CPP Graph] Asym model (#306)
airMeng committed Sep 15, 2023
1 parent fcd59ab commit 93ca550
Showing 4 changed files with 48 additions and 8 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/script/models/cpp_graph_inference.sh
@@ -44,7 +44,7 @@ function main() {
infer_cmd="./build/bin/run_gptj"
model_name="EleutherAI/gpt-j-6b"
input_model="/tf_dataset2/models/pytorch/gpt-j-6B"
precision_list=("q4_j_b128")
precision_list=("q4_j_b128" "q4_j_b128_asym")
elif [[ "${model}" == "starcoder-3b" ]]; then
convert_script="${working_dir}/scripts/convert_starcoder.py"
quant_script="./build/bin/quant_starcoder"
@@ -119,6 +119,8 @@ function main() {
${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --bits 4 --block_size 32 --scale_dtype fp32 --compute_type fp32 --alg sym
elif [[ ${precision} == "q4_j_b128" ]]; then
${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --bits 4 --block_size 128 --scale_dtype fp32 --compute_type fp32 --alg sym
elif [[ ${precision} == "q4_j_b128_asym" ]]; then
${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --bits 4 --block_size 128 --scale_dtype fp32 --compute_type fp32 --alg asym
elif [[ ${precision} == "q4_0" ]]; then
${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --bits 4 --block_size 32 --compute_type ggml --alg sym
elif [[ ${precision} == "q4_1" ]]; then
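Note: the new q4_j_b128_asym precision only changes the `--alg` flag passed to the quant binary (asym instead of sym, still 4-bit with block size 128 and fp32 scales). As a rough illustration of what symmetric vs asymmetric weight quantization means here, the sketch below is illustrative only, not the jblas kernels; the function names and the exact int4 ranges are assumptions.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Symmetric 4-bit: one fp32 scale per block, values mapped around zero into [-8, 7].
void quant_block_sym(const float* src, int8_t* dst, float& scale, int block_size) {
  float amax = 0.f;
  for (int i = 0; i < block_size; ++i) amax = std::max(amax, std::fabs(src[i]));
  scale = amax / 7.f;  // assumed mapping of the absolute max onto the int4 range
  const float inv = scale != 0.f ? 1.f / scale : 0.f;
  for (int i = 0; i < block_size; ++i)
    dst[i] = static_cast<int8_t>(std::lround(std::clamp(src[i] * inv, -8.f, 7.f)));
}

// Asymmetric 4-bit: a scale plus a zero point, so the block's [min, max] maps onto [0, 15].
void quant_block_asym(const float* src, int8_t* dst, float& scale, int8_t& zero_point, int block_size) {
  const auto mm = std::minmax_element(src, src + block_size);
  const float mn = *mm.first, mx = *mm.second;
  scale = (mx - mn) / 15.f;
  const float inv = scale != 0.f ? 1.f / scale : 0.f;
  zero_point = static_cast<int8_t>(std::lround(-mn * inv));
  for (int i = 0; i < block_size; ++i)
    dst[i] = static_cast<int8_t>(std::lround(std::clamp(src[i] * inv + zero_point, 0.f, 15.f)));
}
```

The zero point is what lets an asymmetric scheme spend the full 4-bit range on weight blocks whose values are not centered on zero, at the cost of storing one extra value per block.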
@@ -1280,11 +1280,11 @@ class WeightF4ScaleFp32 : public WeightS4ScaleFp32<_GemmCore_T, ISA_T, S4_CLIP>
public:
using Param = typename WeightS8ScaleFp32<_GemmCore_T, ISA_T>::Param;
using StorageWeight = StorageWeightF4ScaleFp32;
PackedWeight* createStorage(const int N, const int K, int blocksize, bool is_sym = true) override {
PackedWeight* createStorage(const int N, const int K, int blocksize) {
int KPad = utils::padto(K, _GemmCore_T::KTILE);
int NPad = utils::padto(N, _GemmCore_T::NTILE);
auto ptr = new StorageWeight(_GemmCore_T::TYPE, F4_T);
ptr->resize(NPad, KPad, blocksize <= 0 ? K : blocksize, is_sym);
ptr->resize(NPad, KPad, blocksize <= 0 ? K : blocksize);
return ptr;
}

@@ -1334,6 +1334,26 @@ class WeightF4ScaleFp32 : public WeightS4ScaleFp32<_GemmCore_T, ISA_T, S4_CLIP>
assert(false);
return JblasInvalidParam;
}
virtual void packQWeight(const int N, const int K, const int8_t* B, const int ldb, const float* scales,
PackedWeight* ptr) {
auto stor = dynamic_cast<StorageWeight*>(ptr);
if (stor) {
int rawnk_scale = utils::updiv(K, stor->mBlockSize);
int nk_scale = utils::updiv(stor->mKPad, stor->mBlockSize);
#pragma omp parallel for
for (int i = 0; i < nk_scale; i++) { // padding copy
if (i < rawnk_scale) {
std::memcpy(stor->mSPtr + i * stor->mNPad, scales + i * N, N * sizeof(scales[0]));
} else {
std::memset(stor->mSPtr + i * stor->mNPad, 0, stor->mNPad * sizeof(stor->mSPtr[0]));
}
}
utils::avector<int8_t> reorded(stor->mKPad * stor->mNPad);
WeightS8ScaleFp32<_GemmCore_T, ISA_T>::reorderWeight(N, K, B, ldb, reorded.data());
WeightS4ScaleFp32<_GemmCore_T, ISA_T, S4_CLIP>::compressWeight(stor->mNPad, stor->mKPad, reorded.data(),
stor->mNPad, stor->mWPtr);
}
}

protected:
virtual void quantRowBlock(const float* srcptr, int8_t* dstptr, int row, int col, int ld_src, int ld_dst,
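The packQWeight added above copies pre-quantized int8 weights and their per-block scales into the padded storage: the real scale rows (rawnk_scale of them) are copied, the rows that only exist because K was padded up to mKPad are zero-filled, and the int8 data is then reordered and compressed to 4-bit. A minimal standalone sketch of the scale-padding part, with illustrative names rather than the actual jblas storage layout:

```cpp
#include <cstring>

// Ceiling division, same role as utils::updiv in the hunk above.
inline int updiv(int a, int b) { return (a + b - 1) / b; }

// Copy N scales per real block row into an NPad-wide, nk_scale-tall padded buffer,
// zero-filling the rows introduced by padding K up to KPad.
void pad_block_scales(const float* scales, float* padded, int N, int K, int KPad, int NPad,
                      int block_size) {
  const int rawnk_scale = updiv(K, block_size);  // rows backed by real scales
  const int nk_scale = updiv(KPad, block_size);  // rows after K padding
  for (int i = 0; i < nk_scale; ++i) {
    if (i < rawnk_scale) {
      std::memcpy(padded + i * NPad, scales + i * N, N * sizeof(float));
    } else {
      std::memset(padded + i * NPad, 0, NPad * sizeof(float));
    }
  }
}
```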
@@ -23,6 +23,12 @@
#include <cstdio>
#endif

#if UINTPTR_MAX == 0xFFFFFFFF
#define NE_MEM_ALIGN 4
#else
#define NE_MEM_ALIGN 16
#endif

#include "core/ne_layers.h"
#include "models/model_utils/util.h"
#include "models/models.h"
@@ -493,9 +499,15 @@ struct model_model_loader {

void calc_sizes(size_t* ctx_size_p, size_t* mmapped_size_p) const {
*ctx_size_p = *mmapped_size_p = 0;
size_t size_needed = 0;
for (const model_load_tensor& lt : tensors_map.tensors) {
*ctx_size_p += sizeof(struct ne_tensor) + NE_OBJECT_SIZE;
*(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
if (lt.type == NE_TYPE_JBLAS) {
size_needed = lt.size;
} else {
size_needed = (lt.size + NE_MEM_ALIGN - 1) / NE_MEM_ALIGN * NE_MEM_ALIGN;
}
*(use_mmap ? mmapped_size_p : ctx_size_p) += size_needed;
}
}

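calc_sizes now rounds non-jblas tensor sizes up to the next NE_MEM_ALIGN boundary (4 bytes on 32-bit builds, 16 bytes otherwise) so the context size accounts for alignment padding, while jblas-packed tensors keep their exact size. A small self-contained sketch of the round-up expression, assuming the same formula as in the hunk above:

```cpp
#include <cassert>
#include <cstddef>

// Round size up to the next multiple of align, matching
// (lt.size + NE_MEM_ALIGN - 1) / NE_MEM_ALIGN * NE_MEM_ALIGN.
constexpr size_t align_up(size_t size, size_t align) {
  return (size + align - 1) / align * align;
}

int main() {
  // With NE_MEM_ALIGN == 16 (64-bit builds):
  assert(align_up(33, 16) == 48);
  assert(align_up(48, 16) == 48);  // already-aligned sizes are unchanged
  // With NE_MEM_ALIGN == 4 (32-bit builds):
  assert(align_up(33, 4) == 36);
  return 0;
}
```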
@@ -796,6 +796,9 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
if (params.bits == quant_bits::q4) {
if (params.scale_dtype == quant_sdtype::fp32) {
if (params.compute_type == quant_comp::int8) {
if (params.alg != quant_alg::sym) {
printf("Current not support asymmetric int8 computation, reset to symmetric\n");
}
if (params.block_size == -1) {
using Kernel = WeiS4ClipFp32PerN<GcCompInt8, JblasAVX512F>;
using KernelRef = WeiS4ClipFp32PerN<GcCompInt8, JblasNoSIMD>;
@@ -824,7 +827,7 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
using KernelRef = WeiS4ClipFp32<GcCompFp32, JblasNoSIMD>;
static Kernel kernel;
static Kernel kernelref;
packedw = kernel.createStorage(n, k, params.block_size);
packedw = kernel.createStorage(n, k, params.block_size, params.alg == quant_alg::sym);
if (cd->AVX512_FP16()) {
kernel.packTransposeWeight(n, k, f32ptr, k, packedw);
} else {
@@ -835,7 +838,7 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
using KernelRef = WeiS4ClipFp32<GcCompBf16, JblasNoSIMD>;
static Kernel kernel;
static Kernel kernelref;
packedw = kernel.createStorage(n, k, params.block_size);
packedw = kernel.createStorage(n, k, params.block_size, params.alg == quant_alg::sym);
if (cd->AMX_BF16()) {
kernel.packTransposeWeight(n, k, f32ptr, k, packedw);
} else {
@@ -848,6 +851,9 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
// TODO add 8bit quantization
if (params.scale_dtype == quant_sdtype::fp32) {
if (params.compute_type == quant_comp::int8) {
if (params.alg != quant_alg::sym) {
printf("Current not support asymmetric int8 computation, reset to symmetric\n");
}
if (params.block_size == -1) {
using Kernel = WeiS8Fp32PerN<GcCompInt8, JblasAVX512F>;
using KernelRef = WeiS8Fp32PerN<GcCompInt8, JblasNoSIMD>;
@@ -876,7 +882,7 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
using KernelRef = WeiS8Fp32<GcCompFp32, JblasNoSIMD>;
static Kernel kernel;
static Kernel kernelref;
packedw = kernel.createStorage(n, k, params.block_size);
packedw = kernel.createStorage(n, k, params.block_size, params.alg == quant_alg::sym);
if (cd->AVX512_FP16()) {
kernel.packTransposeWeight(n, k, f32ptr, k, packedw);
} else {
@@ -887,7 +893,7 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
using KernelRef = WeiS8Fp32<GcCompBf16, JblasNoSIMD>;
static Kernel kernel;
static Kernel kernelref;
packedw = kernel.createStorage(n, k, params.block_size);
packedw = kernel.createStorage(n, k, params.block_size, params.alg == quant_alg::sym);
if (cd->AMX_BF16()) {
kernel.packTransposeWeight(n, k, f32ptr, k, packedw);
} else {
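Across these hunks, the fp32 and bf16 compute paths forward `params.alg == quant_alg::sym` into createStorage, while the int8 compute paths only print a warning and stay symmetric. A hypothetical helper that captures this policy; the enum member lists and the function itself are assumptions, not part of the library:

```cpp
#include <cstdio>

// Illustrative mirrors of the params used above; member lists are assumed.
enum class quant_alg { sym, asym };
enum class quant_comp { int8, fp32, bf16 };

// Decide the is_sym flag passed to createStorage for a given request.
bool resolve_is_sym(quant_alg alg, quant_comp comp) {
  if (comp == quant_comp::int8 && alg != quant_alg::sym) {
    std::printf("asymmetric int8 computation is not supported, falling back to symmetric\n");
    return true;  // int8 compute kernels stay symmetric
  }
  return alg == quant_alg::sym;
}

// Usage (illustrative):
//   bool is_sym = resolve_is_sym(params.alg, params.compute_type);
//   packedw = kernel.createStorage(n, k, params.block_size, is_sym);
```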
