Commit
Add stream mode to chatbot
Clean functions
rahul-tuli committed Sep 26, 2023
1 parent 2d0293d commit 247682e
Showing 1 changed file with 50 additions and 18 deletions.
68 changes: 50 additions & 18 deletions src/deepsparse/transformers/infer.py
@@ -43,7 +43,10 @@
--task TEXT The task to use for the pipeline. Choose any
of `chat`, `codegen`, `text-generation`
[default: chat]
--help Show this message and exit.
--stream / --no_stream Whether to stream output as generated or not
[default: no_stream]
--help Show this message and exit. [default:
False]
Installation: pip install deepsparse[transformers]
Examples:
@@ -62,6 +65,10 @@
4) Disable history
deepsparse.infer models/llama/deployment \
--task text-generation
5) Stream output
deepsparse.infer models/llama/deployment \
--stream
"""
import click
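
The new --stream / --no_stream option documented above is a paired boolean flag: Click's slash syntax generates both the enabling and the disabling switch and infers a boolean parameter, so the explicit is_flag=True used below is not strictly required. A minimal standalone sketch of the same pattern (hypothetical demo command, not part of this file):

import click

@click.command()
@click.option(
    "--stream/--no_stream",
    default=False,
    help="Whether to stream output as generated or not",
)
def demo(stream: bool):
    # Click resolves the flag pair to a single boolean argument
    click.echo(f"streaming enabled: {stream}")

if __name__ == "__main__":
    demo()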

@@ -110,13 +117,20 @@
help="The task to use for the pipeline. Choose any of "
"`chat`, `codegen`, `text-generation`",
)
@click.option(
"--stream/--no_stream",
is_flag=True,
default=False,
help="Whether to stream output as generated or not",
)
def main(
model_path: str,
sequence_length: int,
sampling_temperature: float,
prompt_sequence_length: int,
show_tokens_per_sec: bool,
task: str,
stream: bool,
):
"""
Command Line utility to interact with a text generation LLM in a chatbot style
@@ -131,32 +145,50 @@ def main(
task=task, # let pipeline determine if task is supported
model_path=model_path,
sequence_length=sequence_length,
sampling_temperature=sampling_temperature,
prompt_sequence_length=prompt_sequence_length,
)

# continue prompts until a keyboard interrupt
while True:
input_text = input("User: ")
pipeline_inputs = {"prompt": [input_text]}

pipeline_inputs = dict(
prompt=[input_text],
sampling_temperature=sampling_temperature,
)

if SupportedTasks.is_chat(task):
pipeline_inputs["session_ids"] = session_ids

response = pipeline(**pipeline_inputs, streaming=stream)
_display_bot_response(stream, response)

response = pipeline(**pipeline_inputs)
print("Bot: ", response.generations[0].text)
if show_tokens_per_sec:
times = pipeline.timer_manager.times
prefill_speed = (
1.0 * prompt_sequence_length / times["engine_prompt_prefill_single"]
)
generation_speed = 1.0 / times["engine_token_generation_single"]
print(
f"[prefill: {prefill_speed:.2f} tokens/sec]",
f"[decode: {generation_speed:.2f} tokens/sec]",
sep="\n",
)
_display_generation_speed(prompt_sequence_length, pipeline)


def _display_generation_speed(prompt_sequence_length, pipeline):
# display prefill and generation speed(s) in tokens/sec
times = pipeline.timer_manager.times
prefill_speed = 1.0 * prompt_sequence_length / times["engine_prompt_prefill_single"]
generation_speed = 1.0 / times["engine_token_generation_single"]
print(
f"[prefill: {prefill_speed:.2f} tokens/sec]",
f"[decode: {generation_speed:.2f} tokens/sec]",
sep="\n",
)


def _display_bot_response(stream: bool, response):
# print response from pipeline, streaming or not

print("Bot:", end=" ")
if stream:
for generation in response:
print(generation.generations[0].text, end=" ")
print()
else:
print(response.generations[0].text)


if __name__ == "__main__":
main()
if "__main__" == __name__:
main()
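
When --stream is passed, the pipeline call above runs with streaming=True and returns an iterable of partial results that _display_bot_response prints as they arrive; without it, a single response object is printed. _display_generation_speed reports prefill speed as the prompt sequence length divided by the single prefill time, and decode speed as the reciprocal of the single-token generation time (e.g. a 16-token prompt chunk prefilled in 0.05 s gives 16 / 0.05 = 320 tokens/sec). A minimal sketch of the streaming call pattern outside the CLI, assuming the deepsparse.Pipeline.create entry point and a placeholder model directory:

from deepsparse import Pipeline

# Placeholder deployment path; substitute a real model directory
pipeline = Pipeline.create(task="chat", model_path="models/llama/deployment")

# streaming=True yields partial generations as they are produced
for partial in pipeline(prompt=["Hello!"], streaming=True):
    print(partial.generations[0].text, end=" ")
print()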
