[deepsparse.benchmark] enable benchmarking internal kv cache #1308

Merged · 2 commits · Oct 11, 2023
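In brief: this PR adds an --internal-kv-cache flag to deepsparse.benchmark (and a matching internal_kv_cache argument to benchmark_model) so that models with a KV cache can be benchmarked with the cache state managed inside the compiled DeepSparse engine rather than passed back and forth as ordinary inputs. A hypothetical invocation (the model path is a placeholder for any ONNX model with KV cache inputs):

    deepsparse.benchmark model.onnx --internal-kv-cache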
25 changes: 23 additions & 2 deletions src/deepsparse/benchmark/benchmark_model.py
@@ -267,6 +267,17 @@ def parse_args():
"engine class will be dynamically imported during runtime"
),
)
parser.add_argument(
"--internal-kv-cache",
"--internal_kv_cache",
help=(
"DeepSparse engine only - If True, and a model with KV cache, "
"KV Cache state will be managed within the compiled deepsparse "
"model. This is preferred when applicable for best performance"
),
action="store_true",
default=False,
)
parser.add_argument(
"-q",
"--quiet",
@@ -314,6 +325,7 @@ def benchmark_model(
input_ids_length: Optional[int] = 1,
thread_pinning: str = "core",
engine: str = DEEPSPARSE_ENGINE,
internal_kv_cache: bool = False,
quiet: bool = False,
export_path: Optional[str] = None,
) -> Dict:
@@ -332,6 +344,7 @@
orig_model_path = model_path
model_path = model_to_path(model_path)

cached_outputs = None
if sequence_length and input_ids_length and has_model_kv_cache(model_path):
if input_ids_length > sequence_length:
raise ValueError(
@@ -346,12 +359,18 @@
f"sequence length: {sequence_length}."
)

model_path, _, _ = overwrite_onnx_model_inputs_for_kv_cache_models(
model_path, cached_outs, _ = overwrite_onnx_model_inputs_for_kv_cache_models(
onnx_file_path=model_path,
input_ids_length=input_ids_length,
sequence_length=sequence_length,
batch_size=batch_size,
)

if internal_kv_cache:
_LOGGER.info(
"Benchmarking DeepSparse Engine with internal KV Cache management"
)
cached_outputs = cached_outs
else:
input_ids_length = None
sequence_length = None
@@ -367,6 +386,7 @@
num_streams=num_streams,
scheduler=scheduler,
input_shapes=input_shapes,
cached_outputs=cached_outputs,
)
elif engine == ORT_ENGINE:
model = ORTEngine(
@@ -409,6 +429,7 @@
seconds_to_run=time,
seconds_to_warmup=warmup_time,
num_streams=num_streams,
internal_kv_cache=cached_outputs,
)
export_dict = {
"engine": str(model),
@@ -438,7 +459,6 @@


def main():

args = parse_args()

result = benchmark_model(
@@ -454,6 +474,7 @@ def main():
input_ids_length=args.input_ids_length,
thread_pinning=args.thread_pinning,
engine=args.engine,
internal_kv_cache=args.internal_kv_cache,
quiet=args.quiet,
export_path=args.export_path,
)
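The new argument can also be exercised from Python. A minimal sketch, assuming the elided leading parameters of benchmark_model include model_path, batch_size, and sequence_length (as the function body above suggests) and that the returned dict is the export_dict shown above; the model path is a placeholder:

    from deepsparse.benchmark.benchmark_model import benchmark_model

    results = benchmark_model(
        model_path="model.onnx",  # placeholder: any ONNX model with KV cache inputs
        batch_size=1,
        sequence_length=512,      # total sequence length the cache is sized for
        input_ids_length=1,       # tokens processed per forward pass (decode-style)
        internal_kv_cache=True,   # new in this PR: cache managed inside the engine
    )
    print(results["engine"])      # export_dict carries the engine description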
46 changes: 44 additions & 2 deletions src/deepsparse/benchmark/stream_benchmark.py
@@ -15,19 +15,41 @@
import queue
import threading
import time
from typing import Dict, List
from typing import Any, Dict, List, NamedTuple

import numpy

from deepsparse import Engine


try:
# flake8: noqa
from deepsparse.lib import init_deepsparse_lib
except ImportError:
raise ImportError(
"Unable to import deepsparse python apis. "
"Please contact support@neuralmagic.com"
)


__all__ = ["model_stream_benchmark"]


LIB = init_deepsparse_lib()


class _InputAndCache(NamedTuple):
input: List[numpy.ndarray]
kv_cache: Any


def iteration(model: Engine, input: List[numpy.ndarray]):
start = time.perf_counter()
output = model.run(input, val_inp=False)
if not isinstance(input, _InputAndCache):
output = model.run(input, val_inp=False)
else:
# run with internal kv cache object
output = model._eng_net.execute_list_out(input.input, input.kv_cache)
end = time.perf_counter()
return output, start, end

@@ -96,8 +118,16 @@ def model_stream_benchmark(
seconds_to_run: float,
seconds_to_warmup: float,
num_streams: int,
internal_kv_cache: bool = False,
) -> Dict:

if internal_kv_cache:
kv_cache = LIB.kv_cache(0, 0) # fake KV cache object
input_list = _adjust_input_list_for_internal_kv_cache(
input_list, model.input_names
)
input_list = _InputAndCache(input=input_list, kv_cache=kv_cache)

# Run the benchmark scenario and collect batch times. The engine will be warmed up
# for a few seconds first using "seconds_to_warmup"
if scenario == "singlestream":
Expand Down Expand Up @@ -153,3 +183,15 @@ def model_stream_benchmark(
**percentiles_dict,
}
return benchmark_dict


def _adjust_input_list_for_internal_kv_cache(input_list, input_names):
# if a cached input is detected (name starting with 'past_key_values'),
# shrink it to an effective size of 0 so the engine's internal KV cache
# supplies that state during benchmarking
updated_inputs = []
for name, inputs in zip(input_names, input_list):
if name.startswith("past_key_values"):
# set batch dim to 0 to match pipeline execution
inputs = numpy.zeros_like(inputs, shape=(0, *inputs.shape[1:]))
updated_inputs.append(inputs)
return updated_inputs
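To make the adjustment concrete, here is a small illustration of _adjust_input_list_for_internal_kv_cache as defined above (the input names and shapes are invented for the example; the cache layout is an assumption):

    import numpy

    # one ordinary input and one cached input; shapes are illustrative
    input_names = ["input_ids", "past_key_values.0.key"]
    input_list = [
        numpy.ones((1, 1), dtype=numpy.int64),              # (batch, input_ids_length)
        numpy.ones((1, 16, 511, 64), dtype=numpy.float32),  # assumed cache layout
    ]

    adjusted = _adjust_input_list_for_internal_kv_cache(input_list, input_names)
    print(adjusted[0].shape)  # (1, 1) -- non-cache inputs pass through unchanged
    print(adjusted[1].shape)  # (0, 16, 511, 64) -- batch dim zeroed; the
                              # engine's internal cache supplies this state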