Update llama.cpp submodule to latest release b3778 (#233)
* Update submodule to latest release b3778

* fix: api

* f:m

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: vansangpfiev <sang@jan.ai>
3 people committed on Sep 18, 2024 · 1 parent 88a7411 · commit 206a82c
Showing 6 changed files with 22 additions and 17 deletions.
2 changes: 1 addition & 1 deletion base/cortex-common/enginei.h
@@ -31,7 +31,7 @@ class EngineI {
   virtual bool IsSupported(const std::string& f) {
     if (f == "HandleChatCompletion" || f == "HandleEmbedding" ||
         f == "LoadModel" || f == "UnloadModel" || f == "GetModelStatus" ||
-        f == "GetModels") {
+        f == "GetModels" || f == "SetFileLogger") {
       return true;
     }
     return false;
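
For context, a hedged caller-side sketch of how a host application might use the expanded capability check. Only the IsSupported and SetFileLogger signatures come from the diffs in this commit; ConfigureFileLogging, the include path, and the max_log_lines value are hypothetical.

// Hypothetical caller-side sketch (not part of this commit): guard the new
// "SetFileLogger" capability behind IsSupported() so engine builds that
// predate this change are still handled gracefully.
#include <string>

#include "cortex-common/enginei.h"  // assumed include path for EngineI

void ConfigureFileLogging(EngineI* engine, const std::string& log_path) {
  // Only engines built from this revision onward report "SetFileLogger".
  if (engine != nullptr && engine->IsSupported("SetFileLogger")) {
    engine->SetFileLogger(/*max_log_lines=*/100000, log_path);  // arbitrary cap
  }
}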
2 changes: 1 addition & 1 deletion llama.cpp
Submodule llama.cpp updated 74 files
+3 −0 .github/workflows/build.yml
+6 −0 .github/workflows/server.yml
+3 −3 CMakeLists.txt
+30 −10 Makefile
+2 −0 README.md
+3 −0 ci/run.sh
+12 −10 common/CMakeLists.txt
+59 −60 common/arg.cpp
+157 −53 common/common.cpp
+11 −3 common/common.h
+401 −0 common/log.cpp
+64 −698 common/log.h
+3 −0 common/ngram-cache.cpp
+1 −1 common/sampling.cpp
+2 −0 common/train.cpp
+163 −8 convert_hf_to_gguf.py
+19 −16 examples/batched-bench/batched-bench.cpp
+23 −26 examples/batched/batched.cpp
+48 −45 examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+5 −4 examples/cvector-generator/cvector-generator.cpp
+54 −53 examples/embedding/embedding.cpp
+25 −25 examples/eval-callback/eval-callback.cpp
+1 −1 examples/export-lora/export-lora.cpp
+9 −2 examples/gguf-split/gguf-split.cpp
+2 −0 examples/gritlm/gritlm.cpp
+49 −52 examples/imatrix/imatrix.cpp
+84 −98 examples/infill/infill.cpp
+74 −70 examples/llava/clip.cpp
+29 −38 examples/llava/llava-cli.cpp
+34 −24 examples/llava/llava.cpp
+40 −46 examples/llava/minicpmv-cli.cpp
+26 −29 examples/lookahead/lookahead.cpp
+15 −14 examples/lookup/lookup-stats.cpp
+26 −29 examples/lookup/lookup.cpp
+2 −0 examples/main/README.md
+143 −163 examples/main/main.cpp
+32 −34 examples/parallel/parallel.cpp
+30 −29 examples/passkey/passkey.cpp
+169 −174 examples/perplexity/perplexity.cpp
+36 −34 examples/retrieval/retrieval.cpp
+2 −5 examples/server/CMakeLists.txt
+1 −2 examples/server/README.md
+0 −1 examples/server/bench/README.md
+0 −1 examples/server/bench/bench.py
+210 −356 examples/server/server.cpp
+1 −0 examples/server/tests/.gitignore
+0 −1 examples/server/tests/README.md
+0 −2 examples/server/tests/features/steps/steps.py
+19 −91 examples/server/utils.hpp
+21 −22 examples/simple/simple.cpp
+51 −56 examples/speculative/speculative.cpp
+24 −26 examples/tokenize/tokenize.cpp
+6 −6 flake.lock
+11 −2 ggml/CMakeLists.txt
+5 −4 ggml/include/ggml.h
+5 −7 ggml/src/CMakeLists.txt
+1 −0 ggml/src/ggml-aarch64.c
+614 −0 ggml/src/ggml-cpu-impl.h
+13 −609 ggml/src/ggml-impl.h
+33 −22 ggml/src/ggml-metal.m
+34 −36 ggml/src/ggml-quants.c
+75 −48 ggml/src/ggml.c
+39 −0 ggml/src/llamafile/sgemm.cpp
+56 −0 gguf-py/gguf/constants.py
+9 −0 gguf-py/gguf/gguf_writer.py
+15 −15 gguf-py/gguf/tensor_mapping.py
+1 −0 include/llama.h
+1 −0 src/llama-impl.h
+613 −24 src/llama.cpp
+1 −0 src/unicode.cpp
+2 −0 tests/CMakeLists.txt
+1 −1 tests/test-arg-parser.cpp
+93 −0 tests/test-barrier.cpp
+39 −0 tests/test-log.cpp
16 changes: 9 additions & 7 deletions src/llama_engine.cc
@@ -182,7 +182,7 @@ LlamaEngine::LlamaEngine(int log_option) {
     asynce_file_logger_ = std::make_unique<trantor::FileLogger>();
   }
 
-  log_disable();
+  gpt_log_pause(gpt_log_main());
 
   llama_log_set(
       [](ggml_log_level level, const char* text, void* user_data) {
@@ -367,7 +367,7 @@ void LlamaEngine::GetModels(
   callback(std::move(status), std::move(json_resp));
   LOG_INFO << "Running models responded";
 }
-// should decrepted this function because it no longer used in cortex cpp
+
 void LlamaEngine::SetFileLogger(int max_log_lines,
                                 const std::string& log_path) {
   if (!asynce_file_logger_) {
@@ -511,10 +511,11 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {
     LOG_DEBUG << "stop: " << server_map_[model_id].stop_words.toStyledString();
 
     if (!json_body->operator[]("llama_log_folder").isNull()) {
-      log_enable();
+      gpt_log_resume(gpt_log_main());
       std::string llama_log_folder =
           json_body->operator[]("llama_log_folder").asString();
-      log_set_target(llama_log_folder + "llama.log");
+      llama_log_folder += "llama.log";
+      gpt_log_set_file(gpt_log_main(), llama_log_folder.c_str());
    }  // Set folder for llama log
  }
  if (params.model_alias == "unknown") {
@@ -749,15 +750,16 @@ void LlamaEngine::HandleInferenceImpl(
   auto state = CreateInferenceState(si.ctx);
 
   // Queued task
-  si.q->runTaskInQueue([cb = std::move(callback), state, data, request_id, n_probs]() {
+  si.q->runTaskInQueue([cb = std::move(callback), state, data, request_id,
+                        n_probs]() {
     state->task_id = state->llama.RequestCompletion(data, false, false, -1);
     while (state->llama.model_loaded_external) {
       TaskResult result = state->llama.NextResult(state->task_id);
       if (!result.error) {
         std::string to_send;
-        if (n_probs > 0){
+        if (n_probs > 0) {
           to_send = result.result_json["completion_probabilities"].dump();
-        }else{
+        } else {
           to_send = result.result_json["content"];
         }
         // trim the leading space if it is the first token
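
The "fix: api" bullet in the commit message corresponds to the changes above: llama.cpp b3778 rewrote its common logging layer, so the old log_disable/log_enable/log_set_target calls are replaced with the gpt_log_* family. Below is a minimal sketch of the new pattern, assuming the b3778 common/log.h header is reachable as "log.h" and that gpt_log_main, gpt_log_pause, gpt_log_resume, and gpt_log_set_file behave as the diff suggests; RouteLlamaLogToFile is a hypothetical helper, not code from this commit.

// Minimal sketch under the assumptions above: keep llama.cpp's common logger
// paused by default, and re-enable it with a file sink only when a log folder
// is configured, mirroring what the constructor and LoadModelImpl now do.
#include <string>

#include "log.h"  // llama.cpp b3778 common logging header (assumed include path)

void RouteLlamaLogToFile(const std::string& log_folder) {
  if (log_folder.empty()) {
    // No folder configured: leave the global logger paused, as the engine
    // constructor does right after creating its own trantor file logger.
    gpt_log_pause(gpt_log_main());
    return;
  }
  std::string log_file = log_folder + "llama.log";
  gpt_log_resume(gpt_log_main());                      // turn logging back on
  gpt_log_set_file(gpt_log_main(), log_file.c_str());  // mirror output to file
}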
2 changes: 1 addition & 1 deletion src/llama_engine.h
@@ -30,7 +30,7 @@ class LlamaEngine : public EngineI {
   void GetModels(
       std::shared_ptr<Json::Value> jsonBody,
       std::function<void(Json::Value&&, Json::Value&&)>&& callback) final;
-  void SetFileLogger(int max_log_lines, const std::string& log_path);
+  void SetFileLogger(int max_log_lines, const std::string& log_path) final;
   void SetLoggerOption(const Json::Value& json_body);
 
  private:
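
A short illustrative sketch of what adding `final` to the declaration buys; BaseEngine and ConcreteEngine are hypothetical stand-ins, not types from this repository.

// Illustrative only: `final` on a member function requires it to override a
// base-class virtual, so the compiler checks the signature actually matches
// the interface declaration and forbids further subclasses from overriding it.
#include <string>

struct BaseEngine {
  virtual ~BaseEngine() = default;
  virtual void SetFileLogger(int max_log_lines, const std::string& log_path) = 0;
};

struct ConcreteEngine : BaseEngine {
  // A signature mismatch here (e.g. taking std::string by value) would make
  // this a non-override, and marking a non-virtual function `final` is a
  // compile error.
  void SetFileLogger(int max_log_lines, const std::string& log_path) final {
    (void)max_log_lines;
    (void)log_path;
  }
};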
12 changes: 6 additions & 6 deletions src/llama_server_context.cc
@@ -979,11 +979,10 @@ void LlamaServerContext::SendFinalResponse(LlamaClientSlot& slot) {
           slot.generated_token_probs.begin(),
           slot.generated_token_probs.begin() + slot.sent_token_probs_index);
     }
-    if(!slot.params.stream ){
+    if (!slot.params.stream) {
       res.result_json["completion_probabilities"] =
-          probs_vector_to_json(ctx, probs);
-    }
-    else{
+          probs_vector_to_json(ctx, probs);
+    } else {
       res.result_json["completion_probabilities"] = std::move(json());
     }
   }
@@ -1491,8 +1490,9 @@ bool LlamaServerContext::UpdateSlots() {
           slot.num_prompt_tokens_processed =
              slot.num_prompt_tokens - slot.n_past;
 
-          LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n",
-                  slot.id, slot.n_past, slot.num_prompt_tokens_processed);
+          LOG_DEBUG << "slot " << slot.id << " : in cache: " << slot.n_past
+                    << " tokens | to process: "
+                    << slot.num_prompt_tokens_processed << " tokens";
         }
       }
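
The LOG_TEE call above is dropped because b3778's common/log.h rewrite (see the +401/−698 churn in the submodule file list) removed the old printf-style macro, so the slot statistics now go through the stream-style LOG_DEBUG macro used elsewhere in this codebase. A small sketch, assuming LOG_DEBUG is trantor's logging macro (the engine already constructs a trantor::FileLogger) and using SlotStats as a hypothetical stand-in for the slot fields named in the diff.

// Sketch under the assumptions above: the same slot statistics, emitted via a
// stream-style macro instead of the removed printf-style LOG_TEE.
#include <trantor/utils/Logger.h>

struct SlotStats {
  int id = 0;
  int n_past = 0;                       // tokens already in the KV cache
  int num_prompt_tokens_processed = 0;  // prompt tokens still to process
};

void LogSlotCacheStats(const SlotStats& slot) {
  LOG_DEBUG << "slot " << slot.id << " : in cache: " << slot.n_past
            << " tokens | to process: " << slot.num_prompt_tokens_processed
            << " tokens";
}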
5 changes: 4 additions & 1 deletion src/llama_server_context.h
@@ -4,8 +4,11 @@
 #include <mutex>
 #include <set>
 #include <string>
+#include <thread>
 #include <vector>
 
+#include "log.h"
+
 // External
 
 #include "llama_client_slot.h"
@@ -116,7 +119,7 @@ struct LlamaServerContext {
   bool clean_kv_cache = true;
   bool all_slots_are_idle = false;
   bool add_bos_token = true;
-  bool has_eos_token = false;
+  bool has_eos_token = false;
 
   std::atomic<int32_t> id_gen;
   int32_t n_ctx;  // total context for all clients / slots
