[CPP Graph] Baichuan & Baichuan2 Enabling (#376)
* Enable Baichuan and Baichuan2 in LLM Runtime
Zhenzhong1 authored Oct 13, 2023
1 parent bb9f35a commit 98e5f9a
Showing 24 changed files with 861 additions and 21 deletions.
@@ -26,6 +26,7 @@ LLM Runtime supports the following models:
|[BLOOM-7B](https://huggingface.co/bigscience/bloomz-7b1)|||
|[OPT-125m](https://huggingface.co/facebook/opt-125m), [OPT-350m](https://huggingface.co/facebook/opt-350m), [OPT-1.3B](https://huggingface.co/facebook/opt-1.3b), [OPT-13B](https://huggingface.co/facebook/opt-13b)|||
|[ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b), [ChatGLM2-6B](https://huggingface.co/THUDM/chatglm2-6b)|||
|[Baichuan-13B-Chat](https://huggingface.co/baichuan-inc/Baichuan-13B-Chat), [Baichuan2-13B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat)|||

### Code Generation
| model name | INT8 | INT4|
@@ -53,6 +53,8 @@ def __import_package(self, model_name):
import intel_extension_for_transformers.llm.runtime.graph.chatglm_cpp as cpp_model
elif model_name == "chatglm2":
import intel_extension_for_transformers.llm.runtime.graph.chatglm2_cpp as cpp_model
elif model_name == "baichuan":
import intel_extension_for_transformers.llm.runtime.graph.baichuan_cpp as cpp_model
else:
raise TypeError("Unspported model type {}!".format(model_name))
self.module = cpp_model
@@ -63,7 +63,8 @@ compile_quant(quant_opt quant_model.cpp opt opt)
compile_quant(quant_bloom quant_model.cpp bloom bloom)

compile_quant(quant_chatglm quant_model.cpp chatglm chatglm)
compile_quant(quant_chatglm2 quant_model.cpp chatglm2 chatglm2)
compile_quant(quant_baichuan quant_model.cpp baichuan baichuan)

# all models running
if (NE_PYTHON_API)
@@ -82,6 +83,7 @@ set(mymap_opt 8)
set(mymap_bloom 9)
set(mymap_chatglm2 10)
set(mymap_chatglm 11)
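# Keep this id in sync with the `#elif MODEL_NAME_ID == 12` branch of the pybind11 dispatch below.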
set(mymap_baichuan 12)

function(compile_run TARGET SRC MODEL_NAME MODEL_LIB)
add_executable_w_warning(${TARGET} ${SRC})
@@ -110,4 +112,5 @@ compile_run(run_starcoder main_run.cpp starcoder starcoder)
compile_run(run_opt main_run.cpp opt opt)
compile_run(run_bloom main_run.cpp bloom bloom)
compile_run(run_chatglm2 main_run.cpp chatglm2 chatglm2)
compile_run(run_chatglm main_run.cpp chatglm chatglm)
compile_run(run_baichuan main_run.cpp baichuan baichuan)
@@ -406,6 +406,10 @@ PYBIND11_MODULE(chatglm2_cpp, m)

PYBIND11_MODULE(chatglm_cpp, m)

#elif MODEL_NAME_ID == 12
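// Built with MODEL_NAME_ID == 12 (mymap_baichuan in CMake), this translation unit becomes
// the baichuan_cpp extension that __import_package() loads on the Python side.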

PYBIND11_MODULE(baichuan_cpp, m)

#endif
{
m.doc() = "cpp model python binding";
@@ -209,7 +209,7 @@ int main(int argc, char** argv) {
std::string prompt = build_prompt_glm2(prompts);
embd_inp = ::model_tokenize(ctx, prompt, false);
embd_inp.insert(embd_inp.begin(), {64790, 64792}); // special prefix
} else if (params.model_arch == MODEL_CHATGLM) {
} else if (params.model_arch == MODEL_CHATGLM || params.model_arch == MODEL_BAICHUAN) {
// Baichuan reuses the ChatGLM path: pre-tokenized ids supplied via params are used as-is.
for (auto& i : params.ids) {
embd_inp.emplace_back(i);
}
@@ -548,7 +548,8 @@ int main(int argc, char** argv) {
}

// display text
if (params.model_arch == MODEL_CHATGLM || params.model_arch == MODEL_CHATGLM2) {
if (params.model_arch == MODEL_CHATGLM || params.model_arch == MODEL_CHATGLM2 ||
params.model_arch == MODEL_BAICHUAN) {
static bool is_prompt = true;
if (input_echo) {
if (is_prompt == true) {
2 changes: 2 additions & 0 deletions intel_extension_for_transformers/llm/runtime/graph/core/ne.h
@@ -148,6 +148,8 @@ struct ne_tensor {
char padding[8];
};

static const size_t NE_TENSOR_SIZE = sizeof(struct ne_tensor);

// computation graph
struct ne_cgraph {
int n_nodes;
@@ -261,6 +261,7 @@ NE_API struct ne_tensor* ne_norm(struct ne_context* ctx, struct ne_tensor* a);

NE_API struct ne_tensor* ne_rms_norm(struct ne_context* ctx, struct ne_tensor* a);

NE_API struct ne_tensor* ne_rms_norm_inplace(struct ne_context* ctx, struct ne_tensor* a);
// a - x
// b - dy
NE_API struct ne_tensor* ne_rms_norm_back(struct ne_context* ctx, struct ne_tensor* a, struct ne_tensor* b);
@@ -22,3 +22,4 @@ add_subdirectory(falcon)
add_subdirectory(opt)
add_subdirectory(bloom)
add_subdirectory(chatglm)
add_subdirectory(baichuan)
@@ -0,0 +1,19 @@
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set(TARGET baichuan)
add_library_w_warning(${TARGET} baichuan.cpp baichuan_utils.cpp ${MODEL_UTILS_SOURCE})
target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_link_libraries(${TARGET} PUBLIC ne_layers ${LLAMA_EXTRA_LIBS} jblas::jblas)
@@ -0,0 +1,276 @@
// Copyright (c) 2023 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <cstring>
#include <exception>
#include <fstream>
#include <iterator>
#include <iostream>
#include <memory>
#include <random>
#include <string>
#include <unordered_map>
#include <vector>

#include "core/data_types.h"
#include "core/ne.h"
#include "core/ne_layers.h"
#include "core/ne_jblas.h"
#include "core/layers/mha_dense.h"
#include "models/model_utils/model_config.h"
#include "models/model_utils/model_utils.h"
#include "models/model_utils/util.h"

// evaluate the transformer
//
// - lctx: model context
// - tokens: new batch of tokens to process
// - n_past: the context size so far
// - n_threads: number of threads to use
//

static int flag = 0;
static int first_tokens_size = 0;
static bool baichuan_model_eval_internal(model_context& lctx, const model_token* tokens, const int n_tokens,
const int n_past, const int n_threads) {
const int64_t t_start_us = ne_time_us();

const int N = n_tokens;

const int batch_size = lctx.batch_size;
const auto& model = lctx.model;
const auto& hparams = model.hparams;

const auto& kv_self = model.kv_self;

MODEL_ASSERT(!!kv_self.ctx);

const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;

// Remember the prompt length from the very first eval call.
if (flag == 0) {
first_tokens_size = n_tokens;
flag++;
}

const int n_head = hparams.n_head;
const int n_vocab = hparams.n_vocab;
const int n_rot = n_embd / n_head / 2;
const int num_attention_heads = n_head;

auto& mem_per_token = lctx.mem_per_token;
auto& buf_compute = lctx.buf_compute;

struct ne_init_params params = {
/*.mem_size =*/buf_compute.size,
/*.mem_buffer =*/buf_compute.addr,
/*.no_alloc =*/false,
};

struct ne_context* ctx0 = ne_init(params);

// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;

struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N);
memcpy(embd->data, tokens, N * ne_element_size(embd));

struct ne_tensor* inpL = ne_get_rows(ctx0, model.others[0], embd);
int hidden_size = inpL->ne[0];
int qlen = inpL->ne[1];
int head_size = hidden_size / num_attention_heads;
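// Per layer: RMSNorm -> fused QKV projection -> ALiBi attention over the KV cache
// -> output projection -> residual add -> RMSNorm -> SwiGLU-style MLP -> residual add.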
for (int il = 0; il < n_layer; ++il) {
struct ne_tensor* cur;

lctx.use_buf(ctx0, 0);

struct ne_tensor* residual = inpL;

// LayerNorm
cur = ne_rms_norm(ctx0, inpL);
cur = ne_mul(ctx0, cur, model.layers[il].norm[0]);
// SelfAttention
{
// Linear::forward compute QKV
cur = ne_mul_mat(ctx0, model.layers[il].attn[0], cur);

ne_tensor* query_layer = ne_view_3d(ctx0, cur, head_size, n_head, N, head_size * ne_element_size(cur), cur->nb[1],
0); // [qlen, hidden]
query_layer = ne_permute(ctx0, query_layer, 0, 2, 1, 3); // [heads, qlen, head_size]

ne_tensor* key_layer =
ne_view_3d(ctx0, cur, head_size, num_attention_heads, qlen, head_size * ne_element_size(cur), cur->nb[1],
hidden_size * ne_element_size(cur));
key_layer = ne_permute(ctx0, key_layer, 0, 2, 1, 3); // [heads, qlen, head_size]

ne_tensor* value_layer =
ne_view_3d(ctx0, cur, head_size, num_attention_heads, qlen, head_size * ne_element_size(cur), cur->nb[1],
2 * hidden_size * ne_element_size(cur)); // [qlen, heads, head_size]
value_layer = ne_permute(ctx0, value_layer, 1, 2, 0, 3); // [heads, head_size, qlen]

// store key and value to memory
{
struct ne_tensor* k_cache_view =
ne_view_3d(ctx0, model.layers[il].k_cache, head_size, qlen, num_attention_heads,
model.layers[il].k_cache->nb[1], model.layers[il].k_cache->nb[2],
n_past * head_size * ne_element_size(model.layers[il].k_cache)); // [kv_heads, qlen, head_size]

struct ne_tensor* v_cache_view =
ne_view_3d(ctx0, model.layers[il].v_cache, qlen, head_size, num_attention_heads,
model.layers[il].v_cache->nb[1], model.layers[il].v_cache->nb[2],
n_past * ne_element_size(model.layers[il].v_cache)); // [kv_heads, head_size, qlen]

ne_build_forward_expand(&gf, ne_cpy(ctx0, key_layer, k_cache_view));
ne_build_forward_expand(&gf, ne_cpy(ctx0, value_layer, v_cache_view));
}
// concat key & value with past kv
key_layer = ne_view_3d(ctx0, model.layers[il].k_cache, head_size, n_past + qlen, num_attention_heads,
model.layers[il].k_cache->nb[1], model.layers[il].k_cache->nb[2],
0); // [kv_heads, klen, head_size]
value_layer = ne_view_3d(ctx0, model.layers[il].v_cache, n_past + qlen, head_size, num_attention_heads,
model.layers[il].v_cache->nb[1], model.layers[il].v_cache->nb[2],
0); // [kv_heads, head_size, klen]

// attention
struct ne_tensor* attn_scores = ne_mul_mat(ctx0, key_layer, query_layer); // [heads, qlen, klen]
attn_scores = ne_scale_inplace(ctx0, attn_scores, ne_new_f32(ctx0, 1.f / std::sqrt(head_size)));
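// ALiBi: add per-head linear position biases to the raw scores (Baichuan-13B uses
// ALiBi rather than rotary embeddings); the trailing constant is presumably the
// maximum bias from which the per-head slopes are derived.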
attn_scores = ne_alibi(ctx0, attn_scores, n_past, num_attention_heads, 8);
if (n_past == 0) {
attn_scores = ne_diag_mask_inf_inplace(ctx0, attn_scores, n_past);
}
ne_tensor* attn_probs = ne_soft_max_inplace(ctx0, attn_scores); // [heads, qlen, klen]

// ne_compute_forward_mul_mat_f16_f32
ne_tensor* context_layer = ne_mul_mat(ctx0, value_layer, attn_probs); // [heads, qlen, head_size]
context_layer = ne_cont(ctx0, ne_permute(ctx0, context_layer, 0, 2, 1, 3));
context_layer = ne_reshape_2d(ctx0, context_layer, hidden_size, qlen);

// F32 mul_mat
cur = ne_mul_mat(ctx0, model.layers[il].attn[1], context_layer);
}

lctx.use_buf(ctx0, 1);
cur = ne_add_inplace(ctx0, cur, residual);
residual = cur;

// post_attention_layernorm
struct ne_tensor* hidden_states = ne_rms_norm(ctx0, cur);
hidden_states = ne_mul(ctx0, hidden_states, model.layers[il].norm[1]);

// mlp.forward
struct ne_tensor* gate = ne_mul_mat(ctx0, model.layers[il].ffn[0], hidden_states);
gate = ne_silu(ctx0, gate);
struct ne_tensor* up = ne_mul_mat(ctx0, model.layers[il].ffn[1], hidden_states);
struct ne_tensor* mlp_output = ne_mul(ctx0, gate, up);
mlp_output = ne_mul_mat(ctx0, model.layers[il].ffn[2], mlp_output);

inpL = ne_add_inplace(ctx0, mlp_output, residual);
}

lctx.use_buf(ctx0, 0);
// used at the end to optionally extract the embeddings
struct ne_tensor* embeddings = NULL;
// norm
{
inpL = ne_rms_norm(ctx0, inpL);
inpL = ne_mul(ctx0, inpL, model.others[1]);
}

lctx.use_buf(ctx0, -1);
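// For a multi-token (prompt) pass, keep only the hidden state of the last position
// before the lm_head matmul, so only one row of logits is computed.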
if (embd->ne[0] > 1) {
inpL = ne_view_1d(ctx0, inpL, hidden_size, (embd->ne[0] - 1) * hidden_size * ne_element_size(inpL));
}

// lm_head
inpL = ne_mul_mat(ctx0, model.others[2], inpL);

ne_build_forward_expand(&gf, inpL);
ne_graph_compute(ctx0, &gf);

#ifdef NE_PERF
bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
if (engine_profiling_) {
ne_graph_profiling(&gf);
}
#endif

// update kv token count
lctx.model.kv_self.n = n_past + N;

// extract logits
{
auto& logits_out = lctx.logits;

if (lctx.logits_all) {
logits_out.resize(n_vocab * N);
memcpy(logits_out.data(), (float*)ne_get_data(inpL), sizeof(float) * n_vocab * N);
} else {
// return result for just the last token
logits_out.resize(n_vocab);
memcpy(logits_out.data(), (float*)ne_get_data(inpL), sizeof(float) * n_vocab);
}
}

// extract embeddings
if (!lctx.embedding.empty()) {
auto& embedding_out = lctx.embedding;

embedding_out.resize(n_embd);
memcpy(embedding_out.data(), (float*)ne_get_data(embeddings) + (n_embd * (N - 1)), sizeof(float) * n_embd);
}

if (mem_per_token == 0) {
mem_per_token = ne_used_mem(ctx0) / N;
}

ne_free(ctx0);

// measure the performance only for the single-token evals
int64_t time_interval = ne_time_us() - t_start_us;
if (N == 1) {
lctx.t_eval_us += time_interval;
lctx.n_eval++;
} else if (N > 1) {
lctx.t_p_eval_us += time_interval;
lctx.n_p_eval += N;
}
lctx.eval_times.push_back(time_interval);

return true;
}

int model_eval(struct model_context* ctx, const model_token* tokens, int n_tokens, int n_past, int n_threads) {
if (!baichuan_model_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) {
fprintf(stderr, "%s: failed to eval\n", __func__);
return 1;
}

// get a more accurate load time, upon first eval
// TODO: fix this
if (!ctx->has_evaluated_once) {
ctx->t_load_us = ne_time_us() - ctx->t_start_us;
ctx->has_evaluated_once = true;
}

return 0;
}
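
For orientation, a minimal, hypothetical sketch of how a caller could drive the model_eval() exported above: evaluate the whole prompt once, then feed one token per step and sample from the last-token logits. Only model_eval() comes from this commit; build_context(), get_prompt_ids(), pick_next_token() and max_new_tokens are illustrative placeholders, not APIs added here.

// Hypothetical driver; only model_eval() is defined in this file.
struct model_context* ctx = build_context();          // placeholder for runtime setup
std::vector<model_token> ids = get_prompt_ids();      // placeholder: tokenized prompt
int n_past = 0;

// Prompt pass: N > 1, so its time is accumulated into t_p_eval_us above.
model_eval(ctx, ids.data(), (int)ids.size(), n_past, /*n_threads=*/8);
n_past += (int)ids.size();

// Decode loop: one token per call; the context then holds n_vocab logits for that token.
for (int step = 0; step < max_new_tokens; ++step) {
  model_token next = pick_next_token(ctx);            // placeholder: e.g. argmax over logits
  if (model_eval(ctx, &next, /*n_tokens=*/1, n_past, /*n_threads=*/8) != 0) break;
  ++n_past;
}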