ggerganov · ggerganov · Mar 26, 2024 · Mar 15, 2024 · Mar 15, 2024 · Mar 16, 2024
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
@@ -403,6 +403,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
                 tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
             }
 
+            // TODO: use batch.logits to save computations instead of relying on logits_all == true
             if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
                 fprintf(stderr, "%s : failed to eval\n", __func__);
                 return false;

diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
@@ -132,7 +132,6 @@ int main(int argc, char ** argv) {
     llama_context * ctx = NULL;
 
     // load the target model
-    params.logits_all = true;
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
 
     // load the prompts from an external file if there are any

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
@@ -744,7 +744,8 @@ struct server_context {
         {
             const int32_t n_batch = llama_n_batch(ctx);
 
-            batch = llama_batch_init(n_batch, 0, params.n_parallel);
+            // each token is tagged with exactly one seq_id, so the per-token seq_id capacity is 1, not n_parallel
+            batch = llama_batch_init(n_batch, 0, 1);
         }
 
         metrics.init();

diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
@@ -65,7 +65,6 @@ int main(int argc, char ** argv) {
     llama_context * ctx_dft = NULL;
 
     // load the target model
-    params.logits_all = true;
     std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params);
 
     // load the draft model