Update llama.cpp submodule to latest release b3778 (#233)
* Update submodule to latest release b3778

* fix: api

* f:m

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: vansangpfiev <sang@jan.ai>
3 people committed on Sep 18, 2024 · 1 parent 88a7411 · commit 206a82c
Showing 6 changed files with 22 additions and 17 deletions.
2 changes: 1 addition & 1 deletion base/cortex-common/enginei.h
@@ -31,7 +31,7 @@ class EngineI {
   virtual bool IsSupported(const std::string& f) {
     if (f == "HandleChatCompletion" || f == "HandleEmbedding" ||
         f == "LoadModel" || f == "UnloadModel" || f == "GetModelStatus" ||
-        f == "GetModels") {
+        f == "GetModels" || f == "SetFileLogger") {
       return true;
     }
     return false;
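
For context, a hedged caller-side sketch of how a host application might use the expanded capability check. Only the IsSupported and SetFileLogger signatures come from the diffs in this commit; ConfigureFileLogging, the include path, and the max_log_lines value are hypothetical.

// Hypothetical caller-side sketch (not part of this commit): guard the new
// "SetFileLogger" capability behind IsSupported() so engine builds that
// predate this change are still handled gracefully.
#include <string>

#include "cortex-common/enginei.h"  // assumed include path for EngineI

void ConfigureFileLogging(EngineI* engine, const std::string& log_path) {
  // Only engines built from this revision onward report "SetFileLogger".
  if (engine != nullptr && engine->IsSupported("SetFileLogger")) {
    engine->SetFileLogger(/*max_log_lines=*/100000, log_path);  // arbitrary cap
  }
}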
2 changes: 1 addition & 1 deletion llama.cpp
Submodule llama.cpp updated 74 files
+3 −0 .github/workflows/build.yml
+6 −0 .github/workflows/server.yml
+3 −3 CMakeLists.txt
+30 −10 Makefile
+2 −0 README.md
+3 −0 ci/run.sh
+12 −10 common/CMakeLists.txt
+59 −60 common/arg.cpp
+157 −53 common/common.cpp
+11 −3 common/common.h
+401 −0 common/log.cpp
+64 −698 common/log.h
+3 −0 common/ngram-cache.cpp
+1 −1 common/sampling.cpp
+2 −0 common/train.cpp
+163 −8 convert_hf_to_gguf.py
+19 −16 examples/batched-bench/batched-bench.cpp
+23 −26 examples/batched/batched.cpp
+48 −45 examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+5 −4 examples/cvector-generator/cvector-generator.cpp
+54 −53 examples/embedding/embedding.cpp
+25 −25 examples/eval-callback/eval-callback.cpp
+1 −1 examples/export-lora/export-lora.cpp
+9 −2 examples/gguf-split/gguf-split.cpp
+2 −0 examples/gritlm/gritlm.cpp
+49 −52 examples/imatrix/imatrix.cpp
+84 −98 examples/infill/infill.cpp
+74 −70 examples/llava/clip.cpp
+29 −38 examples/llava/llava-cli.cpp
+34 −24 examples/llava/llava.cpp
+40 −46 examples/llava/minicpmv-cli.cpp
+26 −29 examples/lookahead/lookahead.cpp
+15 −14 examples/lookup/lookup-stats.cpp
+26 −29 examples/lookup/lookup.cpp
+2 −0 examples/main/README.md
+143 −163 examples/main/main.cpp
+32 −34 examples/parallel/parallel.cpp
+30 −29 examples/passkey/passkey.cpp
+169 −174 examples/perplexity/perplexity.cpp
+36 −34 examples/retrieval/retrieval.cpp
+2 −5 examples/server/CMakeLists.txt
+1 −2 examples/server/README.md
+0 −1 examples/server/bench/README.md
+0 −1 examples/server/bench/bench.py
+210 −356 examples/server/server.cpp
+1 −0 examples/server/tests/.gitignore
+0 −1 examples/server/tests/README.md
+0 −2 examples/server/tests/features/steps/steps.py
+19 −91 examples/server/utils.hpp
+21 −22 examples/simple/simple.cpp
+51 −56 examples/speculative/speculative.cpp
+24 −26 examples/tokenize/tokenize.cpp
+6 −6 flake.lock
+11 −2 ggml/CMakeLists.txt
+5 −4 ggml/include/ggml.h
+5 −7 ggml/src/CMakeLists.txt
+1 −0 ggml/src/ggml-aarch64.c
+614 −0 ggml/src/ggml-cpu-impl.h
+13 −609 ggml/src/ggml-impl.h
+33 −22 ggml/src/ggml-metal.m
+34 −36 ggml/src/ggml-quants.c
+75 −48 ggml/src/ggml.c
+39 −0 ggml/src/llamafile/sgemm.cpp
+56 −0 gguf-py/gguf/constants.py
+9 −0 gguf-py/gguf/gguf_writer.py
+15 −15 gguf-py/gguf/tensor_mapping.py
+1 −0 include/llama.h
+1 −0 src/llama-impl.h
+613 −24 src/llama.cpp
+1 −0 src/unicode.cpp
+2 −0 tests/CMakeLists.txt
+1 −1 tests/test-arg-parser.cpp
+93 −0 tests/test-barrier.cpp
+39 −0 tests/test-log.cpp
16 changes: 9 additions & 7 deletions src/llama_engine.cc
@@ -182,7 +182,7 @@ LlamaEngine::LlamaEngine(int log_option) {
     asynce_file_logger_ = std::make_unique<trantor::FileLogger>();
   }
 
-  log_disable();
+  gpt_log_pause(gpt_log_main());
 
   llama_log_set(
       [](ggml_log_level level, const char* text, void* user_data) {
@@ -367,7 +367,7 @@ void LlamaEngine::GetModels(
   callback(std::move(status), std::move(json_resp));
   LOG_INFO << "Running models responded";
 }
-// should decrepted this function because it no longer used in cortex cpp
+
 void LlamaEngine::SetFileLogger(int max_log_lines,
                                 const std::string& log_path) {
   if (!asynce_file_logger_) {
@@ -511,10 +511,11 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {
     LOG_DEBUG << "stop: " << server_map_[model_id].stop_words.toStyledString();
 
     if (!json_body->operator[]("llama_log_folder").isNull()) {
-      log_enable();
+      gpt_log_resume(gpt_log_main());
       std::string llama_log_folder =
           json_body->operator[]("llama_log_folder").asString();
-      log_set_target(llama_log_folder + "llama.log");
+      llama_log_folder += "llama.log";
+      gpt_log_set_file(gpt_log_main(), llama_log_folder.c_str());
    }  // Set folder for llama log
  }
  if (params.model_alias == "unknown") {
@@ -749,15 +750,16 @@ void LlamaEngine::HandleInferenceImpl(
   auto state = CreateInferenceState(si.ctx);
 
   // Queued task
-  si.q->runTaskInQueue([cb = std::move(callback), state, data, request_id, n_probs]() {
+  si.q->runTaskInQueue([cb = std::move(callback), state, data, request_id,
+                        n_probs]() {
     state->task_id = state->llama.RequestCompletion(data, false, false, -1);
     while (state->llama.model_loaded_external) {
       TaskResult result = state->llama.NextResult(state->task_id);
       if (!result.error) {
         std::string to_send;
-        if (n_probs > 0){
+        if (n_probs > 0) {
           to_send = result.result_json["completion_probabilities"].dump();
-        }else{
+        } else {
           to_send = result.result_json["content"];
         }
         // trim the leading space if it is the first token
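
The "fix: api" bullet in the commit message corresponds to the changes above: llama.cpp b3778 rewrote its common logging layer, so the old log_disable/log_enable/log_set_target calls are replaced with the gpt_log_* family. Below is a minimal sketch of the new pattern, assuming the b3778 common/log.h header is reachable as "log.h" and that gpt_log_main, gpt_log_pause, gpt_log_resume, and gpt_log_set_file behave as the diff suggests; RouteLlamaLogToFile is a hypothetical helper, not code from this commit.

// Minimal sketch under the assumptions above: keep llama.cpp's common logger
// paused by default, and re-enable it with a file sink only when a log folder
// is configured, mirroring what the constructor and LoadModelImpl now do.
#include <string>

#include "log.h"  // llama.cpp b3778 common logging header (assumed include path)

void RouteLlamaLogToFile(const std::string& log_folder) {
  if (log_folder.empty()) {
    // No folder configured: leave the global logger paused, as the engine
    // constructor does right after creating its own trantor file logger.
    gpt_log_pause(gpt_log_main());
    return;
  }
  std::string log_file = log_folder + "llama.log";
  gpt_log_resume(gpt_log_main());                      // turn logging back on
  gpt_log_set_file(gpt_log_main(), log_file.c_str());  // mirror output to file
}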
2 changes: 1 addition & 1 deletion src/llama_engine.h
@@ -30,7 +30,7 @@ class LlamaEngine : public EngineI {
   void GetModels(
       std::shared_ptr<Json::Value> jsonBody,
       std::function<void(Json::Value&&, Json::Value&&)>&& callback) final;
-  void SetFileLogger(int max_log_lines, const std::string& log_path);
+  void SetFileLogger(int max_log_lines, const std::string& log_path) final;
   void SetLoggerOption(const Json::Value& json_body);
 
  private:
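
A short illustrative sketch of what adding `final` to the declaration buys; BaseEngine and ConcreteEngine are hypothetical stand-ins, not types from this repository.

// Illustrative only: `final` on a member function requires it to override a
// base-class virtual, so the compiler checks the signature actually matches
// the interface declaration and forbids further subclasses from overriding it.
#include <string>

struct BaseEngine {
  virtual ~BaseEngine() = default;
  virtual void SetFileLogger(int max_log_lines, const std::string& log_path) = 0;
};

struct ConcreteEngine : BaseEngine {
  // A signature mismatch here (e.g. taking std::string by value) would make
  // this a non-override, and marking a non-virtual function `final` is a
  // compile error.
  void SetFileLogger(int max_log_lines, const std::string& log_path) final {
    (void)max_log_lines;
    (void)log_path;
  }
};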
12 changes: 6 additions & 6 deletions src/llama_server_context.cc
@@ -979,11 +979,10 @@ void LlamaServerContext::SendFinalResponse(LlamaClientSlot& slot) {
           slot.generated_token_probs.begin(),
           slot.generated_token_probs.begin() + slot.sent_token_probs_index);
     }
-    if(!slot.params.stream ){
+    if (!slot.params.stream) {
       res.result_json["completion_probabilities"] =
-          probs_vector_to_json(ctx, probs);
-    }
-    else{
+          probs_vector_to_json(ctx, probs);
+    } else {
       res.result_json["completion_probabilities"] = std::move(json());
     }
   }
@@ -1491,8 +1490,9 @@ bool LlamaServerContext::UpdateSlots() {
           slot.num_prompt_tokens_processed =
              slot.num_prompt_tokens - slot.n_past;
 
-          LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n",
-                  slot.id, slot.n_past, slot.num_prompt_tokens_processed);
+          LOG_DEBUG << "slot " << slot.id << " : in cache: " << slot.n_past
+                    << " tokens | to process: "
+                    << slot.num_prompt_tokens_processed << " tokens";
         }
       }
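
The LOG_TEE call above is dropped because b3778's common/log.h rewrite (see the +401/−698 churn in the submodule file list) removed the old printf-style macro, so the slot statistics now go through the stream-style LOG_DEBUG macro used elsewhere in this codebase. A small sketch, assuming LOG_DEBUG is trantor's logging macro (the engine already constructs a trantor::FileLogger) and using SlotStats as a hypothetical stand-in for the slot fields named in the diff.

// Sketch under the assumptions above: the same slot statistics, emitted via a
// stream-style macro instead of the removed printf-style LOG_TEE.
#include <trantor/utils/Logger.h>

struct SlotStats {
  int id = 0;
  int n_past = 0;                       // tokens already in the KV cache
  int num_prompt_tokens_processed = 0;  // prompt tokens still to process
};

void LogSlotCacheStats(const SlotStats& slot) {
  LOG_DEBUG << "slot " << slot.id << " : in cache: " << slot.n_past
            << " tokens | to process: " << slot.num_prompt_tokens_processed
            << " tokens";
}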
5 changes: 4 additions & 1 deletion src/llama_server_context.h
@@ -4,8 +4,11 @@
 #include <mutex>
 #include <set>
 #include <string>
+#include <thread>
 #include <vector>
 
+#include "log.h"
+
 // External
 
 #include "llama_client_slot.h"
@@ -116,7 +119,7 @@ struct LlamaServerContext {
   bool clean_kv_cache = true;
   bool all_slots_are_idle = false;
   bool add_bos_token = true;
-  bool has_eos_token = false;
+  bool has_eos_token = false;
 
   std::atomic<int32_t> id_gen;
   int32_t n_ctx;  // total context for all clients / slots
