[deepsparse.benchmark] enable benchmarking internal kv cache #1308

Merged · 2 commits · Oct 11, 2023
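In brief: this PR adds an --internal-kv-cache flag to deepsparse.benchmark (and a matching internal_kv_cache argument to benchmark_model) so that models with a KV cache can be benchmarked with the cache state managed inside the compiled DeepSparse engine rather than passed back and forth as ordinary inputs. A hypothetical invocation (the model path is a placeholder for any ONNX model with KV cache inputs):

    deepsparse.benchmark model.onnx --internal-kv-cache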
25 changes: 23 additions & 2 deletions src/deepsparse/benchmark/benchmark_model.py
@@ -267,6 +267,17 @@ def parse_args():
"engine class will be dynamically imported during runtime"
),
)
parser.add_argument(
"--internal-kv-cache",
"--internal_kv_cache",
help=(
"DeepSparse engine only - If True, and a model with KV cache, "
"KV Cache state will be managed within the compiled deepsparse "
"model. This is preferred when applicable for best performance"
),
action="store_true",
default=False,
)
parser.add_argument(
"-q",
"--quiet",
@@ -314,6 +325,7 @@ def benchmark_model(
input_ids_length: Optional[int] = 1,
thread_pinning: str = "core",
engine: str = DEEPSPARSE_ENGINE,
internal_kv_cache: bool = False,
quiet: bool = False,
export_path: Optional[str] = None,
) -> Dict:
@@ -332,6 +344,7 @@
orig_model_path = model_path
model_path = model_to_path(model_path)

cached_outputs = None
if sequence_length and input_ids_length and has_model_kv_cache(model_path):
if input_ids_length > sequence_length:
raise ValueError(
@@ -346,12 +359,18 @@
f"sequence length: {sequence_length}."
)

model_path, _, _ = overwrite_onnx_model_inputs_for_kv_cache_models(
model_path, cached_outs, _ = overwrite_onnx_model_inputs_for_kv_cache_models(
onnx_file_path=model_path,
input_ids_length=input_ids_length,
sequence_length=sequence_length,
batch_size=batch_size,
)

if internal_kv_cache:
_LOGGER.info(
"Benchmarking DeepSparse Engine with internal KV Cache management"
)
cached_outputs = cached_outs
else:
input_ids_length = None
sequence_length = None
@@ -367,6 +386,7 @@
num_streams=num_streams,
scheduler=scheduler,
input_shapes=input_shapes,
cached_outputs=cached_outputs,
)
elif engine == ORT_ENGINE:
model = ORTEngine(
@@ -409,6 +429,7 @@
seconds_to_run=time,
seconds_to_warmup=warmup_time,
num_streams=num_streams,
internal_kv_cache=cached_outputs,
)
export_dict = {
"engine": str(model),
@@ -438,7 +459,6 @@


def main():

args = parse_args()

result = benchmark_model(
@@ -454,6 +474,7 @@ def main():
input_ids_length=args.input_ids_length,
thread_pinning=args.thread_pinning,
engine=args.engine,
internal_kv_cache=args.internal_kv_cache,
quiet=args.quiet,
export_path=args.export_path,
)
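The new argument can also be exercised from Python. A minimal sketch, assuming the elided leading parameters of benchmark_model include model_path, batch_size, and sequence_length (as the function body above suggests) and that the returned dict is the export_dict shown above; the model path is a placeholder:

    from deepsparse.benchmark.benchmark_model import benchmark_model

    results = benchmark_model(
        model_path="model.onnx",  # placeholder: any ONNX model with KV cache inputs
        batch_size=1,
        sequence_length=512,      # total sequence length the cache is sized for
        input_ids_length=1,       # tokens processed per forward pass (decode-style)
        internal_kv_cache=True,   # new in this PR: cache managed inside the engine
    )
    print(results["engine"])      # export_dict carries the engine description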
46 changes: 44 additions & 2 deletions src/deepsparse/benchmark/stream_benchmark.py
@@ -15,19 +15,41 @@
import queue
import threading
import time
from typing import Dict, List
from typing import Any, Dict, List, NamedTuple

import numpy

from deepsparse import Engine


try:
# flake8: noqa
from deepsparse.lib import init_deepsparse_lib
except ImportError:
raise ImportError(
"Unable to import deepsparse python apis. "
"Please contact support@neuralmagic.com"
)


__all__ = ["model_stream_benchmark"]


LIB = init_deepsparse_lib()


class _InputAndCache(NamedTuple):
input: List[numpy.ndarray]
kv_cache: Any


def iteration(model: Engine, input: List[numpy.ndarray]):
start = time.perf_counter()
output = model.run(input, val_inp=False)
if not isinstance(input, _InputAndCache):
output = model.run(input, val_inp=False)
else:
# run with internal kv cache object
output = model._eng_net.execute_list_out(input.input, input.kv_cache)
end = time.perf_counter()
return output, start, end

@@ -96,8 +118,16 @@ def model_stream_benchmark(
seconds_to_run: float,
seconds_to_warmup: float,
num_streams: int,
internal_kv_cache: bool = False,
) -> Dict:

if internal_kv_cache:
kv_cache = LIB.kv_cache(0, 0) # fake KV cache object
input_list = _adjust_input_list_for_internal_kv_cache(
input_list, model.input_names
)
input_list = _InputAndCache(input=input_list, kv_cache=kv_cache)

# Run the benchmark scenario and collect batch times. The engine will be warmed up
# for a few seconds first using "seconds_to_warmup"
if scenario == "singlestream":
Expand Down Expand Up @@ -153,3 +183,15 @@ def model_stream_benchmark(
**percentiles_dict,
}
return benchmark_dict


def _adjust_input_list_for_internal_kv_cache(input_list, input_names):
# if a cached input is detected (name starting with 'past_key_values'),
# shrink it to an effective size of 0 so the engine's internal KV cache
# supplies that state during benchmarking
updated_inputs = []
for name, inputs in zip(input_names, input_list):
if name.startswith("past_key_values"):
# set batch dim to 0 to match pipeline execution
inputs = numpy.zeros_like(inputs, shape=(0, *inputs.shape[1:]))
updated_inputs.append(inputs)
return updated_inputs
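To make the adjustment concrete, here is a small illustration of _adjust_input_list_for_internal_kv_cache as defined above (the input names and shapes are invented for the example; the cache layout is an assumption):

    import numpy

    # one ordinary input and one cached input; shapes are illustrative
    input_names = ["input_ids", "past_key_values.0.key"]
    input_list = [
        numpy.ones((1, 1), dtype=numpy.int64),              # (batch, input_ids_length)
        numpy.ones((1, 16, 511, 64), dtype=numpy.float32),  # assumed cache layout
    ]

    adjusted = _adjust_input_list_for_internal_kv_cache(input_list, input_names)
    print(adjusted[0].shape)  # (1, 1) -- non-cache inputs pass through unchanged
    print(adjusted[1].shape)  # (0, 16, 511, 64) -- batch dim zeroed; the
                              # engine's internal cache supplies this state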