diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index a6b07318ea..5959812ca4 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -829,6 +829,11 @@ def engine_forward( generated_tokens.append(token) generated_logits.append(logits) + if session.total_num_processed_tokens >= session.capacity: + # if the kv cache is full, stop generation + finished_reason.append(FinishReason.CAPACITY) + break + if ( token == self.tokenizer.eos_token_id and not self.force_max_tokens