diff --git a/base/cortex-common/enginei.h b/base/cortex-common/enginei.h
index b352952..06e54aa 100644
--- a/base/cortex-common/enginei.h
+++ b/base/cortex-common/enginei.h
@@ -31,7 +31,7 @@ class EngineI {
   virtual bool IsSupported(const std::string& f) {
     if (f == "HandleChatCompletion" || f == "HandleEmbedding" ||
         f == "LoadModel" || f == "UnloadModel" || f == "GetModelStatus" ||
-        f == "GetModels") {
+        f == "GetModels" || f == "SetFileLogger") {
       return true;
     }
     return false;
diff --git a/llama.cpp b/llama.cpp
index 7596487..8b836ae 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit 7596487bebd58eade3cd0133d42a9008aaaf9d09
+Subproject commit 8b836ae731bbb2c5640bc47df5b0a78ffcb129cb
diff --git a/src/llama_engine.cc b/src/llama_engine.cc
index c361618..7bdd5dd 100644
--- a/src/llama_engine.cc
+++ b/src/llama_engine.cc
@@ -182,7 +182,7 @@ LlamaEngine::LlamaEngine(int log_option) {
     asynce_file_logger_ = std::make_unique<trantor::FileLogger>();
   }
 
-  log_disable();
+  gpt_log_pause(gpt_log_main());
 
   llama_log_set(
       [](ggml_log_level level, const char* text, void* user_data) {
@@ -367,7 +367,7 @@ void LlamaEngine::GetModels(
   callback(std::move(status), std::move(json_resp));
   LOG_INFO << "Running models responded";
 }
-// should decrepted this function because it no longer used in cortex cpp
+
 void LlamaEngine::SetFileLogger(int max_log_lines,
                                 const std::string& log_path) {
   if (!asynce_file_logger_) {
@@ -511,10 +511,11 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {
     LOG_DEBUG << "stop: " << server_map_[model_id].stop_words.toStyledString();
 
     if (!json_body->operator[]("llama_log_folder").isNull()) {
-      log_enable();
+      gpt_log_resume(gpt_log_main());
       std::string llama_log_folder =
          json_body->operator[]("llama_log_folder").asString();
-      log_set_target(llama_log_folder + "llama.log");
+      llama_log_folder += "llama.log";
+      gpt_log_set_file(gpt_log_main(), llama_log_folder.c_str());
     }  // Set folder for llama log
   }
   if (params.model_alias == "unknown") {
@@ -749,15 +750,16 @@ void LlamaEngine::HandleInferenceImpl(
   auto state = CreateInferenceState(si.ctx);
 
   // Queued task
-  si.q->runTaskInQueue([cb = std::move(callback), state, data, request_id, n_probs]() {
+  si.q->runTaskInQueue([cb = std::move(callback), state, data, request_id,
+                        n_probs]() {
     state->task_id = state->llama.RequestCompletion(data, false, false, -1);
     while (state->llama.model_loaded_external) {
       TaskResult result = state->llama.NextResult(state->task_id);
       if (!result.error) {
         std::string to_send;
-        if (n_probs > 0){
+        if (n_probs > 0) {
           to_send = result.result_json["completion_probabilities"].dump();
-        }else{
+        } else {
           to_send = result.result_json["content"];
         }
         // trim the leading space if it is the first token
diff --git a/src/llama_engine.h b/src/llama_engine.h
index fa9c01e..87c7b2d 100644
--- a/src/llama_engine.h
+++ b/src/llama_engine.h
@@ -30,7 +30,7 @@ class LlamaEngine : public EngineI {
   void GetModels(
       std::shared_ptr<Json::Value> jsonBody,
       std::function<void(Json::Value&&, Json::Value&&)>&& callback) final;
-  void SetFileLogger(int max_log_lines, const std::string& log_path);
+  void SetFileLogger(int max_log_lines, const std::string& log_path) final;
   void SetLoggerOption(const Json::Value& json_body);
 
  private:
diff --git a/src/llama_server_context.cc b/src/llama_server_context.cc
index e8e7205..d6de582 100644
--- a/src/llama_server_context.cc
+++ b/src/llama_server_context.cc
@@ -979,11 +979,10 @@ void LlamaServerContext::SendFinalResponse(LlamaClientSlot& slot) {
           slot.generated_token_probs.begin(),
           slot.generated_token_probs.begin() + slot.sent_token_probs_index);
     }
-    if(!slot.params.stream ){
+    if (!slot.params.stream) {
       res.result_json["completion_probabilities"] =
-        probs_vector_to_json(ctx, probs);
-    }
-    else{
+          probs_vector_to_json(ctx, probs);
+    } else {
       res.result_json["completion_probabilities"] = std::move(json());
     }
   }
@@ -1491,8 +1490,9 @@ bool LlamaServerContext::UpdateSlots() {
           slot.num_prompt_tokens_processed =
               slot.num_prompt_tokens - slot.n_past;
 
-          LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n",
-                  slot.id, slot.n_past, slot.num_prompt_tokens_processed);
+          LOG_DEBUG << "slot " << slot.id << " : in cache: " << slot.n_past
+                    << " tokens | to process: "
+                    << slot.num_prompt_tokens_processed << " tokens";
         }
       }
 
diff --git a/src/llama_server_context.h b/src/llama_server_context.h
index aca30b6..57a5367 100644
--- a/src/llama_server_context.h
+++ b/src/llama_server_context.h
@@ -4,8 +4,11 @@
 #include
 #include
 #include
+#include
 #include
 
+#include "log.h"
+
 // External
 #include "llama_client_slot.h"
 
@@ -116,7 +119,7 @@ struct LlamaServerContext {
   bool clean_kv_cache = true;
   bool all_slots_are_idle = false;
   bool add_bos_token = true;
-  bool has_eos_token = false; 
+  bool has_eos_token = false;
 
   std::atomic<int> id_gen;
   int32_t n_ctx;  // total context for all clients / slots
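
For context on the logging change above: after the llama.cpp submodule bump, the old `log_disable()`/`log_enable()`/`log_set_target()` helpers are gone, and logging is driven through the `gpt_log_*` functions in llama.cpp's `common/log.h`, operating on the global logger returned by `gpt_log_main()`. The sketch below only restates how the calls introduced in `llama_engine.cc` fit together; the wrapper function name is hypothetical and not part of this PR.

```cpp
// Sketch only: mirrors the gpt_log_* calls introduced in llama_engine.cc above.
// ConfigureLlamaLogging is a hypothetical helper, not code from this PR.
#include <string>

#include "log.h"  // llama.cpp common logging: gpt_log_main, gpt_log_pause, ...

void ConfigureLlamaLogging(const std::string& llama_log_folder) {
  // Keep llama.cpp's own logger paused by default, as the engine constructor
  // now does with gpt_log_pause() in place of the removed log_disable().
  gpt_log_pause(gpt_log_main());

  if (!llama_log_folder.empty()) {
    // When a "llama_log_folder" is supplied at load time, resume llama.cpp
    // logging and redirect it to a file inside that folder, as LoadModelImpl
    // does above.
    gpt_log_resume(gpt_log_main());
    std::string log_file = llama_log_folder + "llama.log";
    gpt_log_set_file(gpt_log_main(), log_file.c_str());
  }
}
```

Note that, as in `LoadModelImpl`, the file name is appended directly to the folder string, so the supplied `llama_log_folder` value is expected to end with a path separator.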