[deepsparse.benchmark] enable benchmarking internal kv cache (#1308)
* [deepsparse.benchmark] enable benchmarking internal kv cache

* Add last bit

---------

Co-authored-by: mgoin <michael@neuralmagic.com>
bfineran and mgoin committed Oct 11, 2023
1 parent 639e9e4 commit 428d4b5
Showing 2 changed files with 67 additions and 4 deletions.
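For context, a minimal sketch of driving the new option through the Python API (the model path and length values below are illustrative placeholders, not values from this commit):

from deepsparse.benchmark.benchmark_model import benchmark_model

# benchmark a KV-cache decoder model with cache state kept inside the engine;
# "model.onnx" and the lengths are hypothetical example values
results = benchmark_model(
    model_path="model.onnx",
    batch_size=1,
    sequence_length=512,
    input_ids_length=1,
    internal_kv_cache=True,  # the flag added by this commit
)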
25 changes: 23 additions & 2 deletions src/deepsparse/benchmark/benchmark_model.py
@@ -267,6 +267,17 @@ def parse_args():
"engine class will be dynamically imported during runtime"
),
)
    parser.add_argument(
        "--internal-kv-cache",
        "--internal_kv_cache",
        help=(
            "DeepSparse engine only - if set and the model has a KV cache, "
            "the KV cache state will be managed within the compiled deepsparse "
            "model. This is preferred when applicable for best performance"
        ),
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "-q",
        "--quiet",
@@ -314,6 +325,7 @@ def benchmark_model(
    input_ids_length: Optional[int] = 1,
    thread_pinning: str = "core",
    engine: str = DEEPSPARSE_ENGINE,
    internal_kv_cache: bool = False,
    quiet: bool = False,
    export_path: Optional[str] = None,
) -> Dict:
@@ -332,6 +344,7 @@
    orig_model_path = model_path
    model_path = model_to_path(model_path)

    cached_outputs = None
    if sequence_length and input_ids_length and has_model_kv_cache(model_path):
        if input_ids_length > sequence_length:
            raise ValueError(
@@ -346,12 +359,18 @@
f"sequence length: {sequence_length}."
)

-        model_path, _, _ = overwrite_onnx_model_inputs_for_kv_cache_models(
+        model_path, cached_outs, _ = overwrite_onnx_model_inputs_for_kv_cache_models(
            onnx_file_path=model_path,
            input_ids_length=input_ids_length,
            sequence_length=sequence_length,
            batch_size=batch_size,
        )

        if internal_kv_cache:
            _LOGGER.info(
                "Benchmarking DeepSparse Engine with internal KV Cache management"
            )
            cached_outputs = cached_outs
    else:
        input_ids_length = None
        sequence_length = None
@@ -367,6 +386,7 @@
            num_streams=num_streams,
            scheduler=scheduler,
            input_shapes=input_shapes,
            cached_outputs=cached_outputs,
        )
    elif engine == ORT_ENGINE:
        model = ORTEngine(
@@ -409,6 +429,7 @@
        seconds_to_run=time,
        seconds_to_warmup=warmup_time,
        num_streams=num_streams,
        internal_kv_cache=cached_outputs,
    )
    export_dict = {
        "engine": str(model),
@@ -438,7 +459,6 @@


def main():
-
    args = parse_args()

    result = benchmark_model(
@@ -454,6 +474,7 @@ def main():
        input_ids_length=args.input_ids_length,
        thread_pinning=args.thread_pinning,
        engine=args.engine,
        internal_kv_cache=args.internal_kv_cache,
        quiet=args.quiet,
        export_path=args.export_path,
    )
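The hand-off above is the core of the change: overwrite_onnx_model_inputs_for_kv_cache_models also yields the model's cache output names, and when --internal-kv-cache is set those names reach the engine constructor as cached_outputs so KV state never round-trips through Python. A minimal sketch of that return contract (file path and lengths are illustrative):

# the second return value holds the cache output names that are forwarded to
# the engine via the cached_outputs keyword shown in the hunk above
model_path, cached_outs, _ = overwrite_onnx_model_inputs_for_kv_cache_models(
    onnx_file_path="model.onnx",
    input_ids_length=1,
    sequence_length=512,
    batch_size=1,
)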
46 changes: 44 additions & 2 deletions src/deepsparse/benchmark/stream_benchmark.py
@@ -15,19 +15,41 @@
import queue
import threading
import time
-from typing import Dict, List
+from typing import Any, Dict, List, NamedTuple

import numpy

from deepsparse import Engine


try:
    # flake8: noqa
    from deepsparse.lib import init_deepsparse_lib
except ImportError:
    raise ImportError(
        "Unable to import deepsparse python apis. "
        "Please contact support@neuralmagic.com"
    )


__all__ = ["model_stream_benchmark"]


LIB = init_deepsparse_lib()


class _InputAndCache(NamedTuple):
    input: List[numpy.ndarray]
    kv_cache: Any


def iteration(model: Engine, input: List[numpy.ndarray]):
    start = time.perf_counter()
-    output = model.run(input, val_inp=False)
+    if not isinstance(input, _InputAndCache):
+        output = model.run(input, val_inp=False)
+    else:
+        # run with internal kv cache object
+        output = model._eng_net.execute_list_out(input.input, input.kv_cache)
    end = time.perf_counter()
    return output, start, end
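With this change, iteration() dispatches on the input type: a plain list of arrays takes the usual model.run() path, while an _InputAndCache wrapper routes through the engine's private internal-cache entry point. A hypothetical call of both paths, assuming a compiled model and a prepared input list ins:

# `model` and `ins` are assumed to exist; this mirrors the wrapping done
# in model_stream_benchmark below
wrapped = _InputAndCache(input=ins, kv_cache=LIB.kv_cache(0, 0))
out, t0, t1 = iteration(model, wrapped)  # internal KV cache path
out, t0, t1 = iteration(model, ins)      # ordinary list-of-arrays path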

@@ -96,8 +118,16 @@ def model_stream_benchmark(
    seconds_to_run: float,
    seconds_to_warmup: float,
    num_streams: int,
    internal_kv_cache: bool = False,
) -> Dict:

    if internal_kv_cache:
        kv_cache = LIB.kv_cache(0, 0)  # empty placeholder KV cache object
        input_list = _adjust_input_list_for_internal_kv_cache(
            input_list, model.input_names
        )
        input_list = _InputAndCache(input=input_list, kv_cache=kv_cache)

    # Run the benchmark scenario and collect batch times. The engine will be warmed up
    # for a few seconds first using "seconds_to_warmup"
    if scenario == "singlestream":
@@ -153,3 +183,15 @@
        **percentiles_dict,
    }
    return benchmark_dict


def _adjust_input_list_for_internal_kv_cache(input_list, input_names):
    # if an input is a cached input (its name starts with 'past_key_values'),
    # shrink the sample input to an effective size of 0 for internal cache
    # benchmarking
    updated_inputs = []
    for name, inputs in zip(input_names, input_list):
        if name.startswith("past_key_values"):
            # set the batch dim to 0 to match pipeline execution
            inputs = numpy.zeros_like(inputs, shape=(0, *inputs.shape[1:]))
        updated_inputs.append(inputs)
    return updated_inputs
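The zeros_like call above keeps each cached input's dtype and trailing dimensions while zeroing the leading dimension, so the engine receives an empty placeholder and manages the cache itself. A standalone sketch (the shape is an illustrative guess at a past_key_values layout, not taken from this commit):

import numpy

# hypothetical past_key_values-style input; exact layouts vary per model
x = numpy.random.rand(1, 16, 511, 64).astype(numpy.float32)
empty = numpy.zeros_like(x, shape=(0, *x.shape[1:]))
print(empty.shape, empty.dtype)  # (0, 16, 511, 64) float32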
