Automatically analyze in auto-regressive setting #1212

Merged 4 commits on Aug 28, 2023
63 changes: 36 additions & 27 deletions src/deepsparse/benchmark/benchmark_model.py
@@ -17,31 +17,40 @@

##########
Command help:
usage: deepsparse.benchmark [-h] [-b BATCH_SIZE] [-shapes INPUT_SHAPES]
[-ncores NUM_CORES] [-s {async,sync,elastic}]
[-t TIME] [-w WARMUP_TIME] [-nstreams NUM_STREAMS]
[-pin {none,core,numa}]
[-e {deepsparse,onnxruntime}] [-q]
usage: deepsparse.benchmark [-h] [-b BATCH_SIZE] [-seq_len SEQUENCE_LENGTH]
[-input_ids_len INPUT_IDS_LENGTH]
[-i INPUT_SHAPES] [-ncores NUM_CORES]
[-s {async,sync,elastic}] [-t TIME]
[-w WARMUP_TIME] [-nstreams NUM_STREAMS]
[-pin {none,core,numa}] [-e ENGINE] [-q]
[-x EXPORT_PATH]
model_path

Benchmark ONNX models in the DeepSparse Engine

positional arguments:
model_path Path to an ONNX model file or SparseZoo model stub.
model_path Path to an ONNX model file or SparseZoo model stub

optional arguments:
-h, --help show this help message and exit.
-h, --help show this help message and exit
-b BATCH_SIZE, --batch_size BATCH_SIZE
The batch size to run the analysis for. Must be
greater than 0.
-shapes INPUT_SHAPES, --input_shapes INPUT_SHAPES
greater than 0
-seq_len SEQUENCE_LENGTH, --sequence_length SEQUENCE_LENGTH
The sequence length to run the KV cache supported
model benchmarks for. Must be greater than 0, default
is 2048
-input_ids_len INPUT_IDS_LENGTH, --input_ids_length INPUT_IDS_LENGTH
The input ids length to run the KV cache supported
model benchmarks for. Must be greater than 0, default
is 1
-i INPUT_SHAPES, -shapes INPUT_SHAPES, --input_shapes INPUT_SHAPES
Override the shapes of the inputs, i.e. -shapes
"[1,2,3],[4,5,6],[7,8,9]" results in input0=[1,2,3]
input1=[4,5,6] input2=[7,8,9].
input1=[4,5,6] input2=[7,8,9]
-ncores NUM_CORES, --num_cores NUM_CORES
The number of physical cores to run the analysis on,
defaults to all physical cores available on the system.
defaults to all physical cores available on the system
-s {async,sync,elastic}, --scenario {async,sync,elastic}
Choose between using the async, sync and elastic
scenarios. Sync and async are similar to the single-
@@ -62,13 +71,18 @@
-pin {none,core,numa}, --thread_pinning {none,core,numa}
Enable binding threads to cores ('core' the default),
threads to cores on sockets ('numa'), or disable
('none').
-e {deepsparse,onnxruntime}, --engine {deepsparse,onnxruntime}
('none')
-e ENGINE, --engine ENGINE
Inference engine backend to run eval on. Choices are
'deepsparse', 'onnxruntime'. Default is 'deepsparse'.
-q, --quiet Lower logging verbosity.
Can also specify a user defined engine class by giving
the script and class name in the following format
<path to python script>:<Engine Class name>. This
engine class will be dynamically imported during
runtime
-q, --quiet Lower logging verbosity
-x EXPORT_PATH, --export_path EXPORT_PATH
Store results into a JSON file.
Store results into a JSON file

##########
Example on a BERT from SparseZoo:
@@ -85,8 +99,7 @@
Example on a CodeGen (model with KV cache support)
from SparseZoo with input_ids_length 10 and sequence length 256:
deepsparse.benchmark \
zoo:nlg/text_generation/codegen_mono-350m/pytorch/
huggingface/bigpython_bigquery_thepile/pruned50-none
zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/pruned50-none \
--input_ids_length 10 --sequence_length 256

##########
@@ -97,7 +110,7 @@
Example on local ONNX model at batch size 32 with synchronous (singlestream) execution:
deepsparse.benchmark /PATH/TO/model.onnx --batch_size 32 --scenario sync

"""
""" # noqa E501

import argparse
import importlib
@@ -153,25 +166,21 @@ def parse_args():
default=1,
help="The batch size to run the analysis for. Must be greater than 0",
)

parser.add_argument(
"-seq_len",
"--sequence_length",
type=int,
default=2048,
help="The sequence length to run the "
"KV cache supported model benchmarks for. "
"Must be greater than 0, default is 2048",
default=512,
help="The sequence length to run the KV cache supported model "
"benchmarks for. Must be 1 <= seq_len, default is 512",
)

parser.add_argument(
"-input_ids_len",
"--input_ids_length",
type=int,
default=1,
help="The input ids length to run the "
"KV cache supported model benchmarks for. "
"Must be greater than 0, default is 1",
help="The input ids length to run the KV cache supported model "
"benchmarks for. Must be 1 <= input_ids_len <= seq_len, default is 1",
)
parser.add_argument(
"-i",
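Aside: the two flags added above are plain argparse integer options, and their help strings pin down an invariant, 1 <= input_ids_length <= sequence_length. Below is a minimal, self-contained sketch of flags in that shape; the explicit guard is not visible in this hunk, so treat it as an illustrative assumption rather than the PR's own code.

import argparse

# Sketch of the two KV cache benchmark flags added in this PR.
parser = argparse.ArgumentParser(description="sketch of the new KV cache flags")
parser.add_argument(
    "-seq_len",
    "--sequence_length",
    type=int,
    default=512,
    help="total sequence length for KV-cache-enabled models, seq_len >= 1",
)
parser.add_argument(
    "-input_ids_len",
    "--input_ids_length",
    type=int,
    default=1,
    help="tokens processed per forward pass, 1 <= input_ids_len <= seq_len",
)
args = parser.parse_args()

# Hypothetical guard enforcing the invariant stated in the help text; the
# PR's own enforcement, if any, is outside the lines shown here.
if not 1 <= args.input_ids_length <= args.sequence_length:
    parser.error("requires 1 <= input_ids_length <= sequence_length")

Running the sketch with --input_ids_length 10 --sequence_length 256 mirrors the CodeGen example in the docstring above.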
110 changes: 83 additions & 27 deletions src/deepsparse/debug_analysis.py
@@ -18,12 +18,17 @@
##########
Command help:
usage: deepsparse.debug_analysis [-h] [-wi NUM_WARMUP_ITERATIONS]
[-bi NUM_ITERATIONS] [-ncores NUM_CORES]
[-b BATCH_SIZE] [-ks KERNEL_SPARSITY]
[-ksf KERNEL_SPARSITY_FILE]
[--optimization OPTIMIZATION] [-i INPUT_SHAPES] [-q]
[-x EXPORT_PATH]
model_path
[-bi NUM_ITERATIONS] [-ncores NUM_CORES]
[-b BATCH_SIZE] [-ks KERNEL_SPARSITY]
[-ksf KERNEL_SPARSITY_FILE]
[--optimization OPTIMIZATION]
[-seq_len SEQUENCE_LENGTH]
[-input_ids_len INPUT_IDS_LENGTH]
[-i INPUT_SHAPES] [--use-internal-kvcache]
[--kv-cache-prev-num-tokens KV_CACHE_PREV_NUM_TOKENS]
[--kv-cache-num-frozen-tokens KV_CACHE_NUM_FROZEN_TOKENS]
[-q] [-x EXPORT_PATH]
model_path

Analyze ONNX models in the DeepSparse Engine

@@ -49,14 +54,31 @@
Filepath to per-layer kernel sparsities JSON
--optimization OPTIMIZATION
To enable or disable optimizations (Tensor Columns)
-i INPUT_SHAPES, --input_shapes INPUT_SHAPES
-seq_len SEQUENCE_LENGTH, --sequence_length SEQUENCE_LENGTH
The sequence length to run the KV cache supported
model benchmarks for. Must be seq_len >= 1, default is
512
-input_ids_len INPUT_IDS_LENGTH, --input_ids_length INPUT_IDS_LENGTH
The input ids length to run the KV cache supported
model benchmarks for. Must be 1 <= input_ids_len <=
seq_len, default is 1
-i INPUT_SHAPES, -shapes INPUT_SHAPES, --input_shapes INPUT_SHAPES
Override the shapes of the inputs, i.e. -shapes
"[1,2,3],[4,5,6],[7,8,9]" results in input0=[1,2,3]
input1=[4,5,6] input2=[7,8,9]
--use-internal-kvcache
Enable internal KVCache
--kv-cache-prev-num-tokens KV_CACHE_PREV_NUM_TOKENS
Internal KVCache: The amount of previous tokens that
will be read from the external KV cache on the first
inference
--kv-cache-num-frozen-tokens KV_CACHE_NUM_FROZEN_TOKENS
Internal KVCache: The amount of first tokens that we
want to keep permanently in the KV cache
-q, --quiet Lower logging verbosity
-x EXPORT_PATH, --export_path EXPORT_PATH
Store results into a JSON file
"""
""" # noqa E501

import argparse
import json
@@ -66,8 +88,10 @@
from deepsparse.utils import (
default_cached_outputs,
generate_random_inputs,
has_model_kv_cache,
model_to_path,
override_onnx_input_shapes,
overwrite_cache_model_inputs,
parse_input_shapes,
)

@@ -132,8 +156,25 @@ def parse_args():
type=bool,
default=True,
)
parser.add_argument(
"-seq_len",
"--sequence_length",
type=int,
default=512,
help="The sequence length to run the KV cache supported model "
"benchmarks for. Must be seq_len >= 1, default is 512",
)
parser.add_argument(
"-input_ids_len",
"--input_ids_length",
type=int,
default=1,
help="The input ids length to run the KV cache supported model "
"benchmarks for. Must be 1 <= input_ids_len <= seq_len, default is 1",
)
parser.add_argument(
"-i",
"-shapes",
"--input_shapes",
help="Override the shapes of the inputs, "
'i.e. -shapes "[1,2,3],[4,5,6],[7,8,9]" results in '
@@ -142,21 +183,24 @@
default="",
)
parser.add_argument(
"--use-kvcache", help="Enable KVCache", action="store_true", default=False
"--use-internal-kvcache",
help="Enable internal KVCache",
action="store_true",
default=False,
)
parser.add_argument(
"--kv-cache-prev-num-tokens",
help="KVCache: The amount of previous tokens that will be read"
help="Internal KVCache: The amount of previous tokens that will be read"
" from the external KV cache on the first inference",
type=int,
default=None,
default=0,
)
parser.add_argument(
"--kv-cache-num-frozen-tokens",
help="KVCache: The amount of first tokens that we want to keep"
help="Internal KVCache: The amount of first tokens that we want to keep"
" permanently in the KV cache",
type=int,
default=None,
default=0,
)
parser.add_argument(
"-q",
@@ -307,10 +351,31 @@ def main():
orig_model_path = args.model_path
model_path = model_to_path(args.model_path)

print("Analyzing model: {}".format(orig_model_path))
print(f"Analyzing model: {orig_model_path}")

batch_size = args.batch_size

if has_model_kv_cache(model_path):
if batch_size != 1:
raise ValueError(
"Unable to run models with KV cache support "
"for batch size different than one."
"Please set batch size to 1 and try again"
)

print(
"Found model with KV cache support. "
"Benchmarking the autoregressive model with "
f"input_ids_length: {args.input_ids_length} and "
f"sequence length: {args.sequence_length}."
)

model_path, _, _ = overwrite_cache_model_inputs(
model_path=model_path,
input_ids_length=args.input_ids_length,
sequence_length=args.sequence_length,
)

if input_shapes:
with override_onnx_input_shapes(model_path, input_shapes) as tmp_path:
input_list = generate_random_inputs(tmp_path, batch_size)
@@ -319,24 +384,15 @@

kv_cache_params = None
if args.use_kvcache:
kv_cache_prev_num_tokens = 0
if args.kv_cache_prev_num_tokens is not None:
kv_cache_prev_num_tokens = args.kv_cache_prev_num_tokens

kv_cache_num_frozen_tokens = 0
if args.kv_cache_num_frozen_tokens is not None:
kv_cache_num_frozen_tokens = args.kv_cache_num_frozen_tokens

kv_cache_params = KVCacheParams(
default_cached_outputs(model_path),
kv_cache_prev_num_tokens,
kv_cache_num_frozen_tokens,
args.kv_cache_prev_num_tokens,
args.kv_cache_num_frozen_tokens,
)

print(
"Enable KVCache: prev_num_tokens = {}, num_frozen_tokens = {}".format(
kv_cache_params.prev_num_tokens, kv_cache_params.num_frozen_tokens
)
f"Enable KVCache: prev_num_tokens = {kv_cache_params.prev_num_tokens}, "
f"num_frozen_tokens = {kv_cache_params.num_frozen_tokens}"
)

result = model_debug_analysis(
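For orientation, the KV cache branch of main() above reduces to a short pipeline over helpers this diff imports from deepsparse.utils. The sketch below restates that flow under stated assumptions: the wrapper name prepare_kv_cache_model is hypothetical, while the helper calls and the `model_path, _, _` unpacking match the diff.

from deepsparse.utils import (
    default_cached_outputs,
    has_model_kv_cache,
    model_to_path,
    overwrite_cache_model_inputs,
)


def prepare_kv_cache_model(model_stub, input_ids_length=1, sequence_length=512):
    # Resolve a SparseZoo stub or local path to an ONNX file on disk.
    model_path = model_to_path(model_stub)
    if not has_model_kv_cache(model_path):
        # Plain model: nothing to rewrite, no cached outputs to track.
        return model_path, None
    # Rewrite the ONNX graph inputs so the cached model runs at the requested
    # lengths; the two ignored return values mirror the call in the diff.
    model_path, _, _ = overwrite_cache_model_inputs(
        model_path=model_path,
        input_ids_length=input_ids_length,
        sequence_length=sequence_length,
    )
    return model_path, default_cached_outputs(model_path)

The returned cached-output mask is what the final hunk feeds into KVCacheParams, together with the --kv-cache-prev-num-tokens and --kv-cache-num-frozen-tokens counts (now defaulting to 0 instead of None).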