Skip to content

Commit

Permalink
llama : fix embeddings
Browse files Browse the repository at this point in the history
ggml-ci
  • Loading branch information
ggerganov committed Feb 29, 2024
1 parent 87c91c0 commit 008f3fc
Show file tree
Hide file tree
Showing 6 changed files with 119 additions and 61 deletions.
2 changes: 1 addition & 1 deletion common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1284,7 +1284,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.mul_mat_q = params.mul_mat_q;
cparams.seed = params.seed;
cparams.logits_all = params.logits_all;
cparams.embedding = params.embedding;
cparams.embeddings = params.embedding;
cparams.rope_scaling_type = params.rope_scaling_type;
cparams.rope_freq_base = params.rope_freq_base;
cparams.rope_freq_scale = params.rope_freq_scale;
Expand Down
13 changes: 9 additions & 4 deletions examples/embedding/embedding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ static std::vector<std::string> split_lines(const std::string & s) {

// Append an entire token sequence to `batch` under sequence id `seq_id`.
// The diff capture showed both the old and new llama_batch_add lines; only
// the post-commit line is kept here, otherwise every token would be added
// twice. Only the final token of the sequence requests output (logits flag),
// since that is the position the sequence embedding is read back from after
// decoding.
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
    for (size_t i = 0; i < tokens.size(); i++) {
        // logits/output flag is true only for the last token of the sequence
        llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
    }
}

Expand All @@ -45,9 +45,13 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
}

// normalize on copy
for (int k = 0; k < n_seq; k++) {
float * emb = llama_get_embeddings_ith(ctx, k);
float * out = output + k * n_embd;
for (int i = 0; i < batch.n_tokens; i++) {
if (!batch.logits[i]) {
continue;
}

float * emb = llama_get_embeddings_ith(ctx, i);
float * out = output + batch.seq_id[i][0] * n_embd;
normalize(emb, out, n_embd);
}
}
Expand Down Expand Up @@ -145,6 +149,7 @@ int main(int argc, char ** argv) {
for (int k = 0; k < n_prompts; k++) {
// clamp to n_batch tokens
auto & inp = inputs[k];

const uint64_t n_toks = inp.size();

// encode if at capacity
Expand Down
34 changes: 34 additions & 0 deletions examples/server-embd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Smoke test for the llama.cpp server /embedding endpoint: fire n identical
# requests concurrently, then print the pairwise cosine similarity of the
# returned vectors. Since every request sends the same content, the
# similarities are expected to be close to 1.00 — TODO confirm against the
# server's pooling settings.
import asyncio
import requests
import numpy as np

# number of concurrent embedding requests to send
n = 8

# embedding vectors collected from the responses, one list of floats each
result = []

async def requests_post_async(*args, **kwargs):
    # requests.post is blocking; run it in a worker thread so that
    # asyncio.gather can actually overlap the n requests
    return await asyncio.to_thread(requests.post, *args, **kwargs)

async def main():
    # assumes a llama.cpp server is listening locally on port 6900
    model_url = "http://127.0.0.1:6900"
    # issue all n POSTs concurrently; each sends the same 1024-char prompt
    responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
        url= f"{model_url}/embedding",
        json= {"content": "0"*1024}
    ) for i in range(n)])

    for response in responses:
        embedding = response.json()["embedding"]
        # print the tail of each vector for a quick visual sanity check
        print(embedding[-8:])
        result.append(embedding)

asyncio.run(main())

# compute cosine similarity

# compare every unordered pair (i, j) of collected embeddings
for i in range(n-1):
    for j in range(i+1, n):
        embedding1 = np.array(result[i])
        embedding2 = np.array(result[j])
        # cosine similarity = dot(a, b) / (|a| * |b|)
        similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
        print(f"Similarity between {i} and {j}: {similarity:.2f}")

29 changes: 18 additions & 11 deletions examples/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1271,7 +1271,7 @@ struct llama_server_context
queue_results.send(res);
}

void send_embedding(llama_client_slot &slot)
void send_embedding(llama_client_slot &slot, const llama_batch & batch)
{
task_result res;
res.id = slot.task_id;
Expand All @@ -1280,6 +1280,7 @@ struct llama_server_context
res.stop = true;

const int n_embd = llama_n_embd(model);

if (!params.embedding)
{
LOG_WARNING("embedding disabled", {
Expand All @@ -1292,12 +1293,19 @@ struct llama_server_context
}
else
{
const float *data = llama_get_embeddings(ctx);
std::vector<float> embedding(data, data + n_embd);
res.result_json = json
{
{"embedding", embedding },
};
for (int i = 0; i < batch.n_tokens; ++i) {
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
continue;
}

const float * data = llama_get_embeddings_ith(ctx, i);
std::vector<float> embedding(data, data + n_embd);

res.result_json = json
{
{"embedding", embedding },
};
}
}
queue_results.send(res);
}
Expand Down Expand Up @@ -1891,7 +1899,7 @@ struct llama_server_context
ga_i += ga_w/ga_n;
}
}
llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id }, false);
slot_npast++;
}

Expand Down Expand Up @@ -1927,7 +1935,7 @@ struct llama_server_context

for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
{
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);

for (auto & slot : slots)
{
Expand Down Expand Up @@ -2000,7 +2008,7 @@ struct llama_server_context
// prompt evaluated for embedding
if (slot.embedding)
{
send_embedding(slot);
send_embedding(slot, batch_view);
slot.release();
slot.i_batch = -1;
continue;
Expand Down Expand Up @@ -2359,7 +2367,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
break;
}
params.n_batch = std::stoi(argv[i]);
params.n_batch = std::min(512, params.n_batch);
}
else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
{
Expand Down
94 changes: 53 additions & 41 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1646,6 +1646,7 @@ struct llama_cparams {
float defrag_thold;

bool mul_mat_q;
bool embeddings;
bool offload_kqv;
bool do_pooling;

Expand Down Expand Up @@ -1936,16 +1937,16 @@ struct llama_context {
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
int32_t n_eval = 0; // number of eval calls

// decode output (2-dimensional array: [n_tokens][n_vocab])
// logits output (2-dimensional array: [n_tokens][n_vocab])
std::vector<float> logits;
#ifndef NDEBUG
// guard against access to unset logits
std::vector<bool> logits_valid;
#endif
bool logits_all = false;

// input embedding (1-dimensional array: [n_embd])
std::vector<float> embedding;
// embeddings output (2-dimensional array: [n_tokens][n_embd])
std::vector<float> embeddings;

// memory buffers used to evaluate the model
std::vector<uint8_t> buf_compute_meta;
Expand Down Expand Up @@ -5987,9 +5988,10 @@ struct llm_build_context {

// get input vectors with right size
const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);

struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);

// construct input embeddings (token, type, position)
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
Expand Down Expand Up @@ -7971,17 +7973,17 @@ static int llama_decode_internal(
ggml_cgraph * gf = llama_build_graph(lctx, batch, false);

// the output is always the last tensor in the graph
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];

if (strcmp(res->name, "result_output") == 0) {
// the embeddings could be the second to last tensor, or the third to last tensor
if (strcmp(embeddings->name, "result_norm") != 0) {
embeddings = gf->nodes[gf->n_nodes - 3];
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
if (strcmp(embd->name, "result_norm") != 0) {
embd = gf->nodes[gf->n_nodes - 3];
GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
}
} else if (strcmp(res->name, "result_embd") == 0) {
embeddings = res;
embd = res;
res = nullptr;
} else {
GGML_ASSERT(false);
Expand Down Expand Up @@ -8051,46 +8053,53 @@ static int llama_decode_internal(
logits_out.clear();
#endif

ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res);
GGML_ASSERT(res_backend != nullptr);
ggml_backend_t backend_res = ggml_backend_sched_get_node_backend(lctx.sched, res);
GGML_ASSERT(backend_res != nullptr);

if (batch.logits) {
logits_out.resize(n_vocab * n_tokens);
for (uint32_t i = 0; i < n_tokens; i++) {
if (batch.logits[i] == 0) {
continue;
}
ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
ggml_backend_tensor_get_async(backend_res, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
#ifndef NDEBUG
logits_valid[i] = true;
#endif
}
} else if (lctx.logits_all) {
logits_out.resize(n_vocab * n_tokens);
ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
#ifndef NDEBUG
std::fill(logits_valid.begin(), logits_valid.end(), true);
#endif
} else {
logits_out.resize(n_vocab);
ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
#ifndef NDEBUG
logits_valid[0] = true;
#endif
}
ggml_backend_synchronize(res_backend);
ggml_backend_synchronize(backend_res);
}

// extract embeddings
if (!lctx.embedding.empty()) {
auto & embedding_out = lctx.embedding;
if (cparams.embeddings && embd) {
auto & embeddings_out = lctx.embeddings;

const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0;
const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
ggml_backend_t backend_embd = ggml_backend_sched_get_node_backend(lctx.sched, embd);
GGML_ASSERT(backend_embd != nullptr);

embedding_out.resize(embd_size);
ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
ggml_backend_synchronize(embeddings_backend);
if (batch.logits) {
embeddings_out.resize(n_embd * n_tokens);
for (uint32_t i = 0; i < n_tokens; i++) {
if (batch.logits[i] == 0) {
continue;
}
ggml_backend_tensor_get_async(backend_embd, embd, embeddings_out.data() + (n_embd*i), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
}
}
ggml_backend_synchronize(backend_embd);
}

// measure the performance only for the single-token evals
Expand Down Expand Up @@ -11634,7 +11643,7 @@ struct llama_context_params llama_context_default_params() {
/*.type_v =*/ GGML_TYPE_F16,
/*.mul_mat_q =*/ true,
/*.logits_all =*/ false,
/*.embedding =*/ false,
/*.embeddings =*/ false,
/*.offload_kqv =*/ true,
/*.do_pooling =*/ true,
};
Expand Down Expand Up @@ -11785,6 +11794,7 @@ struct llama_context * llama_new_context_with_model(
cparams.yarn_beta_slow = params.yarn_beta_slow;
cparams.defrag_thold = params.defrag_thold;
cparams.mul_mat_q = params.mul_mat_q;
cparams.embeddings = params.embeddings;
cparams.offload_kqv = params.offload_kqv;
cparams.do_pooling = params.do_pooling;

Expand Down Expand Up @@ -11933,8 +11943,8 @@ struct llama_context * llama_new_context_with_model(
// resized during inference, reserve maximum
ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);

if (params.embedding) {
ctx->embedding.resize(hparams.n_embd);
if (params.embeddings) {
ctx->embeddings.reserve(hparams.n_embd*cparams.n_batch);
}

// graph inputs
Expand Down Expand Up @@ -12369,7 +12379,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
// assume worst case for logits although only currently set ones are serialized
const size_t s_logits = ctx->logits.capacity() * sizeof(float);
const size_t s_embedding_size = sizeof(size_t);
const size_t s_embedding = ctx->embedding.size() * sizeof(float);
const size_t s_embedding = ctx->embeddings.capacity() * sizeof(float);
const size_t s_kv_size = sizeof(size_t);
const size_t s_kv_ntok = sizeof(int);
const size_t s_kv = ctx->kv_self.total_size();
Expand Down Expand Up @@ -12470,12 +12480,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat

// copy embeddings
{
const size_t embedding_size = ctx->embedding.size();
const size_t embeddings_size = ctx->embeddings.size();

data_ctx->write(&embedding_size, sizeof(embedding_size));
data_ctx->write(&embeddings_size, sizeof(embeddings_size));

if (embedding_size) {
data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
if (embeddings_size) {
data_ctx->write(ctx->embeddings.data(), embeddings_size * sizeof(float));
}
}

Expand Down Expand Up @@ -12581,15 +12591,17 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

// set embeddings
{
size_t embedding_size;
size_t embeddings_size;

memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);

memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
GGML_ASSERT(ctx->embeddings.capacity() == embeddings_size);

GGML_ASSERT(ctx->embedding.capacity() == embedding_size);
if (embeddings_size) {
ctx->embeddings.resize(embeddings_size);

if (embedding_size) {
memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
inp += embedding_size * sizeof(float);
memcpy(ctx->embeddings.data(), inp, embeddings_size * sizeof(float));
inp += embeddings_size * sizeof(float);
}
}

Expand Down Expand Up @@ -12829,11 +12841,11 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
}

float * llama_get_embeddings(struct llama_context * ctx) {
return ctx->embedding.data();
return ctx->embeddings.data();
}

float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
return ctx->embeddings.data() + i*ctx->model.hparams.n_embd;
}

const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
Expand Down
Loading

0 comments on commit 008f3fc

Please sign in to comment.