Commit

Automatically analyze in auto-regressive setting (#1212)
* Automatically analyze in auto-regressive setting

* Update help

* Review

* Quality
mgoin committed Aug 28, 2023
1 parent a70fee1 commit 05b4ea9
Showing 2 changed files with 119 additions and 54 deletions.
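
The net effect: both tools now detect KV cache models and configure them for auto-regressive analysis on their own, instead of requiring manual input-shape overrides. For instance, reusing the CodeGen stub from the benchmark docstring below, an invocation along these lines should work (the stub/debug_analysis pairing is illustrative, not taken from this commit):

    deepsparse.debug_analysis \
        zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/pruned50-none \
        --sequence_length 256 --input_ids_length 10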
63 changes: 36 additions & 27 deletions src/deepsparse/benchmark/benchmark_model.py
@@ -17,31 +17,40 @@
 ##########
 Command help:
-usage: deepsparse.benchmark [-h] [-b BATCH_SIZE] [-shapes INPUT_SHAPES]
-                            [-ncores NUM_CORES] [-s {async,sync,elastic}]
-                            [-t TIME] [-w WARMUP_TIME] [-nstreams NUM_STREAMS]
-                            [-pin {none,core,numa}]
-                            [-e {deepsparse,onnxruntime}] [-q]
+usage: deepsparse.benchmark [-h] [-b BATCH_SIZE] [-seq_len SEQUENCE_LENGTH]
+                            [-input_ids_len INPUT_IDS_LENGTH]
+                            [-i INPUT_SHAPES] [-ncores NUM_CORES]
+                            [-s {async,sync,elastic}] [-t TIME]
+                            [-w WARMUP_TIME] [-nstreams NUM_STREAMS]
+                            [-pin {none,core,numa}] [-e ENGINE] [-q]
                             [-x EXPORT_PATH]
                             model_path

 Benchmark ONNX models in the DeepSparse Engine

 positional arguments:
-  model_path            Path to an ONNX model file or SparseZoo model stub.
+  model_path            Path to an ONNX model file or SparseZoo model stub

 optional arguments:
-  -h, --help            show this help message and exit.
+  -h, --help            show this help message and exit
   -b BATCH_SIZE, --batch_size BATCH_SIZE
                         The batch size to run the analysis for. Must be
-                        greater than 0.
-  -shapes INPUT_SHAPES, --input_shapes INPUT_SHAPES
+                        greater than 0
+  -seq_len SEQUENCE_LENGTH, --sequence_length SEQUENCE_LENGTH
+                        The sequence length to run the KV cache supported
+                        model benchmarks for. Must be greater than 0, default
+                        is 2048
+  -input_ids_len INPUT_IDS_LENGTH, --input_ids_length INPUT_IDS_LENGTH
+                        The input ids length to run the KV cache supported
+                        model benchmarks for. Must be greater than 0, default
+                        is 1
+  -i INPUT_SHAPES, -shapes INPUT_SHAPES, --input_shapes INPUT_SHAPES
                         Override the shapes of the inputs, i.e. -shapes
                         "[1,2,3],[4,5,6],[7,8,9]" results in input0=[1,2,3]
-                        input1=[4,5,6] input2=[7,8,9].
+                        input1=[4,5,6] input2=[7,8,9]
   -ncores NUM_CORES, --num_cores NUM_CORES
                         The number of physical cores to run the analysis on,
-                        defaults to all physical cores available on the system.
+                        defaults to all physical cores available on the system
   -s {async,sync,elastic}, --scenario {async,sync,elastic}
                         Choose between using the async, sync and elastic
                         scenarios. Sync and async are similar to the single-
@@ -62,13 +71,18 @@
   -pin {none,core,numa}, --thread_pinning {none,core,numa}
                         Enable binding threads to cores ('core' the default),
                         threads to cores on sockets ('numa'), or disable
-                        ('none').
-  -e {deepsparse,onnxruntime}, --engine {deepsparse,onnxruntime}
+                        ('none')
+  -e ENGINE, --engine ENGINE
                         Inference engine backend to run eval on. Choices are
                         'deepsparse', 'onnxruntime'. Default is 'deepsparse'.
-  -q, --quiet           Lower logging verbosity.
+                        Can also specify a user defined engine class by giving
+                        the script and class name in the following format
+                        <path to python script>:<Engine Class name>. This
+                        engine class will be dynamically imported during
+                        runtime
+  -q, --quiet           Lower logging verbosity
   -x EXPORT_PATH, --export_path EXPORT_PATH
-                        Store results into a JSON file.
+                        Store results into a JSON file
 ##########
 Example on a BERT from SparseZoo:
@@ -85,8 +99,7 @@
 Example on a CodeGen (model with KV cache support)
 from SparseZoo with input_ids_length 10 and sequence length 256:
 deepsparse.benchmark \
-    zoo:nlg/text_generation/codegen_mono-350m/pytorch/
-    huggingface/bigpython_bigquery_thepile/pruned50-none
+    zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/pruned50-none \
     --input_ids_length 10 --sequence_length 256
 ##########
@@ -97,7 +110,7 @@
 Example on local ONNX model at batch size 32 with synchronous (singlestream) execution:
 deepsparse.benchmark /PATH/TO/model.onnx --batch_size 32 --scenario sync
-"""
+""" # noqa E501

 import argparse
 import importlib
@@ -153,25 +166,21 @@ def parse_args():
         default=1,
         help="The batch size to run the analysis for. Must be greater than 0",
     )

     parser.add_argument(
         "-seq_len",
         "--sequence_length",
         type=int,
-        default=2048,
-        help="The sequence length to run the "
-        "KV cache supported model benchmarks for. "
-        "Must be greater than 0, default is 2048",
+        default=512,
+        help="The sequence length to run the KV cache supported model "
+        "benchmarks for. Must be 1 <= seq_len, default is 512",
     )

     parser.add_argument(
         "-input_ids_len",
         "--input_ids_length",
         type=int,
         default=1,
-        help="The input ids length to run the "
-        "KV cache supported model benchmarks for. "
-        "Must be greater than 0, default is 1",
+        help="The input ids length to run the KV cache supported model "
+        "benchmarks for. Must be 1 <= input_ids_len <= seq_len, default is 1",
     )
     parser.add_argument(
         "-i",
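
The two new knobs are coupled: input_ids_length sets how many tokens the engine consumes per forward pass, while sequence_length bounds the total context the KV cache is sized for. A minimal sketch of the constraint stated in the help strings (hypothetical helper, not code from this commit):

    def check_kv_cache_args(sequence_length: int, input_ids_length: int) -> None:
        # Help text: "Must be 1 <= seq_len" and "1 <= input_ids_len <= seq_len"
        if sequence_length < 1:
            raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")
        if not 1 <= input_ids_length <= sequence_length:
            raise ValueError(
                f"input_ids_length must lie in [1, {sequence_length}], "
                f"got {input_ids_length}"
            )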
110 changes: 83 additions & 27 deletions src/deepsparse/debug_analysis.py
@@ -18,12 +18,17 @@
 ##########
 Command help:
 usage: deepsparse.debug_analysis [-h] [-wi NUM_WARMUP_ITERATIONS]
-                                 [-bi NUM_ITERATIONS] [-ncores NUM_CORES]
-                                 [-b BATCH_SIZE] [-ks KERNEL_SPARSITY]
-                                 [-ksf KERNEL_SPARSITY_FILE]
-                                 [--optimization OPTIMIZATION] [-i INPUT_SHAPES] [-q]
-                                 [-x EXPORT_PATH]
-                                 model_path
+                                 [-bi NUM_ITERATIONS] [-ncores NUM_CORES]
+                                 [-b BATCH_SIZE] [-ks KERNEL_SPARSITY]
+                                 [-ksf KERNEL_SPARSITY_FILE]
+                                 [--optimization OPTIMIZATION]
+                                 [-seq_len SEQUENCE_LENGTH]
+                                 [-input_ids_len INPUT_IDS_LENGTH]
+                                 [-i INPUT_SHAPES] [--use-internal-kvcache]
+                                 [--kv-cache-prev-num-tokens KV_CACHE_PREV_NUM_TOKENS]
+                                 [--kv-cache-num-frozen-tokens KV_CACHE_NUM_FROZEN_TOKENS]
+                                 [-q] [-x EXPORT_PATH]
+                                 model_path

 Analyze ONNX models in the DeepSparse Engine
@@ -49,14 +54,31 @@
                         Filepath to per-layer kernel sparsities JSON
   --optimization OPTIMIZATION
                         To enable or disable optimizations (Tensor Columns)
-  -i INPUT_SHAPES, --input_shapes INPUT_SHAPES
+  -seq_len SEQUENCE_LENGTH, --sequence_length SEQUENCE_LENGTH
+                        The sequence length to run the KV cache supported
+                        model benchmarks for. Must be seq_len >= 1, default is
+                        512
+  -input_ids_len INPUT_IDS_LENGTH, --input_ids_length INPUT_IDS_LENGTH
+                        The input ids length to run the KV cache supported
+                        model benchmarks for. Must be 1 <= input_ids_len <=
+                        seq_len, default is 1
+  -i INPUT_SHAPES, -shapes INPUT_SHAPES, --input_shapes INPUT_SHAPES
                         Override the shapes of the inputs, i.e. -shapes
                         "[1,2,3],[4,5,6],[7,8,9]" results in input0=[1,2,3]
                         input1=[4,5,6] input2=[7,8,9]
+  --use-internal-kvcache
+                        Enable internal KVCache
+  --kv-cache-prev-num-tokens KV_CACHE_PREV_NUM_TOKENS
+                        Internal KVCache: The amount of previous tokens that
+                        will be read from the external KV cache on the first
+                        inference
+  --kv-cache-num-frozen-tokens KV_CACHE_NUM_FROZEN_TOKENS
+                        Internal KVCache: The amount of first tokens that we
+                        want to keep permanently in the KV cache
   -q, --quiet           Lower logging verbosity
   -x EXPORT_PATH, --export_path EXPORT_PATH
                         Store results into a JSON file
-"""
+""" # noqa E501

 import argparse
 import json
@@ -66,8 +88,10 @@
 from deepsparse.utils import (
     default_cached_outputs,
     generate_random_inputs,
+    has_model_kv_cache,
     model_to_path,
     override_onnx_input_shapes,
+    overwrite_cache_model_inputs,
     parse_input_shapes,
 )

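These two new imports carry most of the feature: has_model_kv_cache inspects the ONNX graph for cache inputs, and overwrite_cache_model_inputs rewrites the model so the token axes become static at the requested sizes. A condensed sketch of how they combine, using only calls visible in this diff (model path hypothetical, lengths set to the new argparse defaults):

    from deepsparse.utils import (
        has_model_kv_cache,
        model_to_path,
        overwrite_cache_model_inputs,
    )

    model_path = model_to_path("/PATH/TO/model.onnx")  # hypothetical model
    if has_model_kv_cache(model_path):
        # Returns a path to the rewritten model; the two ignored values
        # mirror the `model_path, _, _ = ...` unpacking in main() below.
        model_path, _, _ = overwrite_cache_model_inputs(
            model_path=model_path,
            input_ids_length=1,
            sequence_length=512,
        )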
@@ -132,8 +156,25 @@ def parse_args():
         type=bool,
         default=True,
     )
+    parser.add_argument(
+        "-seq_len",
+        "--sequence_length",
+        type=int,
+        default=512,
+        help="The sequence length to run the KV cache supported model "
+        "benchmarks for. Must be seq_len >= 1, default is 512",
+    )
+    parser.add_argument(
+        "-input_ids_len",
+        "--input_ids_length",
+        type=int,
+        default=1,
+        help="The input ids length to run the KV cache supported model "
+        "benchmarks for. Must be 1 <= input_ids_len <= seq_len, default is 1",
+    )
     parser.add_argument(
         "-i",
+        "-shapes",
         "--input_shapes",
         help="Override the shapes of the inputs, "
         'i.e. -shapes "[1,2,3],[4,5,6],[7,8,9]" results in '
@@ -142,21 +183,24 @@
         default="",
     )
     parser.add_argument(
-        "--use-kvcache", help="Enable KVCache", action="store_true", default=False
+        "--use-internal-kvcache",
+        help="Enable internal KVCache",
+        action="store_true",
+        default=False,
     )
     parser.add_argument(
         "--kv-cache-prev-num-tokens",
-        help="KVCache: The amount of previous tokens that will be read"
+        help="Internal KVCache: The amount of previous tokens that will be read"
         " from the external KV cache on the first inference",
         type=int,
-        default=None,
+        default=0,
     )
     parser.add_argument(
         "--kv-cache-num-frozen-tokens",
-        help="KVCache: The amount of first tokens that we want to keep"
+        help="Internal KVCache: The amount of first tokens that we want to keep"
         " permanently in the KV cache",
         type=int,
-        default=None,
+        default=0,
     )
     parser.add_argument(
         "-q",
@@ -307,10 +351,31 @@ def main():
     orig_model_path = args.model_path
     model_path = model_to_path(args.model_path)

-    print("Analyzing model: {}".format(orig_model_path))
+    print(f"Analyzing model: {orig_model_path}")

     batch_size = args.batch_size

+    if has_model_kv_cache(model_path):
+        if batch_size != 1:
+            raise ValueError(
+                "Unable to run models with KV cache support "
+                "for batch size different than one. "
+                "Please set batch size to 1 and try again"
+            )
+
+        print(
+            "Found model with KV cache support. "
+            "Benchmarking the autoregressive model with "
+            f"input_ids_length: {args.input_ids_length} and "
+            f"sequence length: {args.sequence_length}."
+        )
+
+        model_path, _, _ = overwrite_cache_model_inputs(
+            model_path=model_path,
+            input_ids_length=args.input_ids_length,
+            sequence_length=args.sequence_length,
+        )
+
     if input_shapes:
         with override_onnx_input_shapes(model_path, input_shapes) as tmp_path:
             input_list = generate_random_inputs(tmp_path, batch_size)
@@ -319,24 +384,15 @@

     kv_cache_params = None
-    if args.use_kvcache:
-        kv_cache_prev_num_tokens = 0
-        if args.kv_cache_prev_num_tokens is not None:
-            kv_cache_prev_num_tokens = args.kv_cache_prev_num_tokens
-
-        kv_cache_num_frozen_tokens = 0
-        if args.kv_cache_num_frozen_tokens is not None:
-            kv_cache_num_frozen_tokens = args.kv_cache_num_frozen_tokens
-
+    if args.use_internal_kvcache:
         kv_cache_params = KVCacheParams(
             default_cached_outputs(model_path),
-            kv_cache_prev_num_tokens,
-            kv_cache_num_frozen_tokens,
+            args.kv_cache_prev_num_tokens,
+            args.kv_cache_num_frozen_tokens,
         )

         print(
-            "Enable KVCache: prev_num_tokens = {}, num_frozen_tokens = {}".format(
-                kv_cache_params.prev_num_tokens, kv_cache_params.num_frozen_tokens
-            )
+            f"Enable KVCache: prev_num_tokens = {kv_cache_params.prev_num_tokens}, "
+            f"num_frozen_tokens = {kv_cache_params.num_frozen_tokens}"
         )

     result = model_debug_analysis(
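
One subtlety behind the rename: argparse converts hyphens in long option names to underscores when building the namespace, which is why --use-internal-kvcache surfaces as args.use_internal_kvcache in main(). A standalone demonstration:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--use-internal-kvcache", action="store_true", default=False)
    args = parser.parse_args(["--use-internal-kvcache"])
    assert args.use_internal_kvcache  # hyphens become underscores in the dest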
