From bff0e0343218fd61b02074d6f0ccc1d36297ee9c Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Fri, 21 Jul 2023 17:30:10 -0400 Subject: [PATCH 01/37] WIP pipeline benchmark script --- .../benchmark/benchmark_pipeline.py | 474 ++++++++++++++++++ 1 file changed, 474 insertions(+) create mode 100644 src/deepsparse/benchmark/benchmark_pipeline.py diff --git a/src/deepsparse/benchmark/benchmark_pipeline.py b/src/deepsparse/benchmark/benchmark_pipeline.py new file mode 100644 index 0000000000..c33c546130 --- /dev/null +++ b/src/deepsparse/benchmark/benchmark_pipeline.py @@ -0,0 +1,474 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Benchmarking script ONNX models in a DeepSparse pipeline + +########## +Command help: +usage: deepsparse.benchmark_pipeline [-h] [-b BATCH_SIZE] [-input_config INPUT_CONFIG] + [-ncores NUM_CORES] [-s {async,sync,elastic}] + [-t TIME] [-w WARMUP_TIME] [-nstreams NUM_STREAMS] + [-pin {none,core,numa}] + [-e {deepsparse,onnxruntime}] [-q] + [-x EXPORT_PATH] + model_path + +Benchmark ONNX models in a DeepSparse pipeline + +positional arguments: + model_path Path to an ONNX model file or SparseZoo model stub. + +optional arguments: + -h, --help show this help message and exit. + -b BATCH_SIZE, --batch_size BATCH_SIZE + The batch size to run the analysis for. Must be + greater than 0. + -input_config INPUT_CONFIG + JSON file containing schema for input data. + -ncores NUM_CORES, --num_cores NUM_CORES + The number of physical cores to run the analysis on, + defaults to all physical cores available on the system. + -s {async,sync,elastic}, --scenario {async,sync,elastic} + Choose between using the async, sync and elastic + scenarios. Sync and async are similar to the single- + stream/multi-stream scenarios. Elastic is a newer + scenario that behaves similarly to the async scenario + but uses a different scheduling backend. Default value + is sync. + -t TIME, --time TIME The number of seconds the benchmark will run. Default + is 10 seconds. + -w WARMUP_TIME, --warmup_time WARMUP_TIME + The number of seconds the benchmark will warmup before + running.Default is 2 seconds. + -nstreams NUM_STREAMS, --num_streams NUM_STREAMS + The number of streams that will submit inferences in + parallel using async scenario. Default is + automatically determined for given hardware and may be + sub-optimal. + -pin {none,core,numa}, --thread_pinning {none,core,numa} + Enable binding threads to cores ('core' the default), + threads to cores on sockets ('numa'), or disable + ('none'). + -e {deepsparse,onnxruntime}, --engine {deepsparse,onnxruntime} + Inference engine backend to run eval on. Choices are + 'deepsparse', 'onnxruntime'. Default is 'deepsparse'. + -q, --quiet Lower logging verbosity. + -x EXPORT_PATH, --export_path EXPORT_PATH + Store results into a JSON file. 
+ +########## +Example on a BERT from SparseZoo: +deepsparse.benchmark \ + zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/base-none + +########## +Example on a BERT from SparseZoo with sequence length 512: +deepsparse.benchmark \ + zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/base-none \ + --input_shapes "[1,512],[1,512],[1,512]" + +########## +Example on local ONNX model: +deepsparse.benchmark /PATH/TO/model.onnx + +########## +Example on local ONNX model at batch size 32 with synchronous (singlestream) execution: +deepsparse.benchmark /PATH/TO/model.onnx --batch_size 32 --scenario sync + +""" + +import argparse +import importlib +import json +import logging +import os +from typing import Dict + +from deepsparse import Scheduler, __version__, compile_model +from deepsparse.benchmark.ort_engine import ORTEngine +from deepsparse.benchmark.stream_benchmark import model_stream_benchmark +from deepsparse.cpu import cpu_architecture +from deepsparse.log import set_logging_level +from deepsparse.utils import ( + generate_random_inputs, + model_to_path, + override_onnx_input_shapes, + parse_input_shapes, +) + + +__all__ = ["benchmark_pipelin"] + + +_LOGGER = logging.getLogger(__name__) + +DEEPSPARSE_ENGINE = "deepsparse" +ORT_ENGINE = "onnxruntime" + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Benchmark ONNX models in the DeepSparse Engine" + ) + + parser.add_argument( + "model_path", + type=str, + help="Path to an ONNX model file or SparseZoo model stub", + ) + + parser.add_argument( + "-b", + "--batch_size", + type=int, + default=1, + help="The batch size to run the analysis for. Must be greater than 0", + ) + parser.add_argument( + "-i", + "--input_config", + type=str, + default="config.json", + help="JSON file containing schema for input data" + ) + parser.add_argument( + "-ncores", + "--num_cores", + type=int, + default=cpu_architecture().num_available_physical_cores, + help=( + "The number of physical cores to run the analysis on, " + "defaults to all physical cores available on the system" + ), + ) + parser.add_argument( + "-s", + "--scenario", + type=str, + default="sync", + choices=["async", "sync", "elastic"], + help=( + "Choose between using the async, sync and elastic scenarios. Sync and " + "async are similar to the single-stream/multi-stream scenarios. Elastic " + "is a newer scenario that behaves similarly to the async scenario " + "but uses a different scheduling backend. Default value is sync." + ), + ) + parser.add_argument( + "-t", + "--time", + type=int, + default=10, + help="The number of seconds the benchmark will run. Default is 10 seconds.", + ) + parser.add_argument( + "-w", + "--warmup_time", + type=int, + default=2, + help=( + "The number of seconds the benchmark will warmup before running." + "Default is 2 seconds." + ), + ) + parser.add_argument( + "-nstreams", + "--num_streams", + type=int, + default=None, + help=( + "The number of streams that will submit inferences in parallel using " + "async scenario. Default is automatically determined for given hardware " + "and may be sub-optimal." + ), + ) + parser.add_argument( + "-pin", + "--thread_pinning", + type=str, + default="core", + choices=["none", "core", "numa"], + help=( + "Enable binding threads to cores ('core' the default), " + "threads to cores on sockets ('numa'), or disable ('none')" + ), + ) + parser.add_argument( + "-e", + "--engine", + type=str, + default=DEEPSPARSE_ENGINE, + help=( + "Inference engine backend to run eval on. 
Choices are 'deepsparse', " + "'onnxruntime'. Default is 'deepsparse'. Can also specify a user " + "defined engine class by giving the script and class name in the " + "following format :. This " + "engine class will be dynamically imported during runtime" + ), + ) + parser.add_argument( + "-q", + "--quiet", + help="Lower logging verbosity", + action="store_true", + default=False, + ) + parser.add_argument( + "-x", + "--export_path", + help="Store results into a JSON file", + type=str, + default=None, + ) + + return parser.parse_args() + + +def decide_thread_pinning(pinning_mode: str) -> None: + pinning_mode = pinning_mode.lower() + if pinning_mode in "core": + os.environ["NM_BIND_THREADS_TO_CORES"] = "1" + _LOGGER.info("Thread pinning to cores enabled") + elif pinning_mode in "numa": + os.environ["NM_BIND_THREADS_TO_CORES"] = "0" + os.environ["NM_BIND_THREADS_TO_SOCKETS"] = "1" + _LOGGER.info("Thread pinning to socket/numa nodes enabled") + elif pinning_mode in "none": + os.environ["NM_BIND_THREADS_TO_CORES"] = "0" + os.environ["NM_BIND_THREADS_TO_SOCKETS"] = "0" + _LOGGER.info("Thread pinning disabled, performance may be sub-optimal") + else: + _LOGGER.info( + "Recieved invalid option for thread_pinning '{}', skipping".format( + pinning_mode + ) + ) + +def parse_input_config(input_config: str) -> Dict[str, object]: + return json.loads(input_config) + +def parse_scheduler(scenario: str) -> Scheduler: + scenario = scenario.lower() + if scenario == "multistream": + return Scheduler.multi_stream + elif scenario == "singlestream": + return Scheduler.single_stream + elif scenario == "elastic": + return Scheduler.elastic + else: + return Scheduler.multi_stream + + +def parse_scenario(scenario: str) -> str: + scenario = scenario.lower() + if scenario == "async": + return "multistream" + elif scenario == "sync": + return "singlestream" + elif scenario == "elastic": + return "elastic" + else: + _LOGGER.info( + "Recieved invalid option for scenario'{}', defaulting to async".format( + scenario + ) + ) + return "multistream" + + +def parse_num_streams(num_streams: int, num_cores: int, scenario: str): + # If model.num_streams is set, and the scenario is either "multi_stream" or + # "elastic", use the value of num_streams given to us by the model, otherwise + # use a semi-sane default value. + if scenario == "sync" or scenario == "singlestream": + if num_streams and num_streams > 1: + _LOGGER.info("num_streams reduced to 1 for singlestream scenario.") + return 1 + else: + if num_streams: + return num_streams + else: + default_num_streams = max(1, int(num_cores / 2)) + _LOGGER.info( + "num_streams default value chosen of {}. 
" + "This requires tuning and may be sub-optimal".format( + default_num_streams + ) + ) + return default_num_streams + + +def load_custom_engine(custom_engine_identifier: str): + """ + import a custom engine based off the specified `custom_engine_identifier` + from user specified script + + :param custom_engine_identifier: string in the form of + ': + :return: custom engine class object + """ + path, engine_object_name = custom_engine_identifier.split(":") + spec = importlib.util.spec_from_file_location("user_defined_custom_engine", path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return getattr(module, engine_object_name) + + +def benchmark_pipeline( + model_path: str, + batch_size: int = 1, + input_config: str = "", + num_cores: int = None, + scenario: str = "sync", + time: int = 10, + warmup_time: int = 2, + num_streams: int = None, + thread_pinning: str = "core", + engine: str = DEEPSPARSE_ENGINE, + quiet: bool = False, + export_path: str = None, +) -> Dict: + if quiet: + set_logging_level(logging.WARN) + + if num_cores is None: + num_cores = cpu_architecture().num_available_physical_cores + + decide_thread_pinning(thread_pinning) + + scenario = parse_scenario(scenario.lower()) + scheduler = parse_scheduler(scenario) + input_config = parse_input_config(input_config) + + orig_model_path = model_path + model_path = model_to_path(model_path) + num_streams = parse_num_streams(num_streams, num_cores, scenario) + + # Compile the ONNX into a runnable model + if engine == DEEPSPARSE_ENGINE: + model = compile_model( + model=model_path, + batch_size=batch_size, + num_cores=num_cores, + num_streams=num_streams, + scheduler=scheduler, + input_shapes=input_shapes, + ) + elif engine == ORT_ENGINE: + model = ORTEngine( + model=model_path, + batch_size=batch_size, + num_cores=num_cores, + input_shapes=input_shapes, + ) + elif ":" in engine: + engine = load_custom_engine(custom_engine_identifier=engine) + model = engine( + model_path=model_path, + batch_size=batch_size, + num_cores=num_cores, + ) + else: + raise ValueError(f"Invalid engine choice '{engine}'") + _LOGGER.info(model) + + # Generate random inputs to feed the model + # TODO(mgoin): should be able to query Engine class instead of loading ONNX + if input_shapes: + with override_onnx_input_shapes(model_path, input_shapes) as model_path: + input_list = generate_random_inputs(model_path, batch_size) + elif hasattr(model, "generate_random_inputs"): + input_list = model.generate_random_inputs() + elif hasattr(engine, "generate_random_inputs"): + input_list = engine.generate_random_inputs(batch_size=batch_size) + else: + input_list = generate_random_inputs(model_path, batch_size) + + # Benchmark + _LOGGER.info( + "Starting '{}' performance measurements for {} seconds".format(scenario, time) + ) + benchmark_result = model_stream_benchmark( + model, + input_list, + scenario=scenario, + seconds_to_run=time, + seconds_to_warmup=warmup_time, + num_streams=num_streams, + ) + + export_dict = { + "engine": str(model), + "version": __version__, + "orig_model_path": orig_model_path, + "model_path": model_path, + "batch_size": batch_size, + "input_config": input_config, + "num_cores": num_cores, + "scenario": scenario, + "scheduler": str(model.scheduler), + "seconds_to_run": time, + "num_streams": num_streams, + "benchmark_result": benchmark_result, + "fraction_of_supported_ops": getattr(model, "fraction_of_supported_ops", None), + } + + # Export results + if export_path: + _LOGGER.info("Saving benchmark results to 
JSON file at {}".format(export_path)) + with open(export_path, "w") as out: + json.dump(export_dict, out, indent=2) + + return export_dict + + +def main(): + + args = parse_args() + + result = benchmark_pipeline( + model_path=args.model_path, + batch_size=args.batch_size, + input_config=args.input_config, + num_cores=args.num_cores, + scenario=args.scenario, + time=args.time, + warmup_time=args.warmup_time, + num_streams=args.num_streams, + thread_pinning=args.thread_pinning, + engine=args.engine, + quiet=args.quiet, + export_path=args.export_path, + ) + + # Results summary + print("Original Model Path: {}".format(args.model_path)) + print("Batch Size: {}".format(args.batch_size)) + print("Scenario: {}".format(args.scenario)) + print( + "Throughput (items/sec): {:.4f}".format( + result["benchmark_result"]["items_per_sec"] + ) + ) + print("Latency Mean (ms/batch): {:.4f}".format(result["benchmark_result"]["mean"])) + print( + "Latency Median (ms/batch): {:.4f}".format(result["benchmark_result"]["median"]) + ) + print("Latency Std (ms/batch): {:.4f}".format(result["benchmark_result"]["std"])) + print("Iterations: {}".format(int(result["benchmark_result"]["iterations"]))) + + +if __name__ == "__main__": + main() From e26eaa72f297082a99378527fb8fbfcc2b8dde38 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Mon, 24 Jul 2023 10:14:22 -0400 Subject: [PATCH 02/37] simple script --- src/deepsparse/benchmark/pipeline_sandbox.py | 62 ++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 src/deepsparse/benchmark/pipeline_sandbox.py diff --git a/src/deepsparse/benchmark/pipeline_sandbox.py b/src/deepsparse/benchmark/pipeline_sandbox.py new file mode 100644 index 0000000000..3885c90d44 --- /dev/null +++ b/src/deepsparse/benchmark/pipeline_sandbox.py @@ -0,0 +1,62 @@ +import argparse +import json +import random +import string + +from deepsparse.pipeline import Pipeline + +def parse_args(): + parser = argparse.ArgumentParser( + description="Benchmark DeepSparse Pipelines" + ) + parser.add_argument( + "task_name", + type=str + ) + parser.add_argument( + "model_path", + type=str + ) + parser.add_argument( + "-i", + "--input_type", + type=str, + default="dummy", + choices=["dummy", "real"], + ) + parser.add_argument( + "-c", + "--config", + type=str, + default="config.json", + ) + + return parser.parse_args() + +def main(): + args = parse_args() + + config_file = open(args.config) + config = json.load(config_file) + config_file.close() + + task_name = args.task_name + model_path = args.model_path + + data_length = config['length'] + num_examples = config['num_examples'] + examples = [] + if config['input_data_type'] == "string": + for _ in range(num_examples): + rand_string = ''.join(random.choices(string.printable, k=data_length)) + examples.append(rand_string) + print(examples) + + pipeline = Pipeline.create(task=task_name, model_path=model_path) + output = pipeline(examples) + print(output) + print(pipeline.timer_manger) + print(pipeline.timer_manager.stages) + +if __name__ == "__main__": + main() \ No newline at end of file From 773229639028be571d063879a30ed19a35529945 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Mon, 24 Jul 2023 13:53:24 -0400 Subject: [PATCH 03/37] share code and cleanup --- src/deepsparse/benchmark/benchmark_model.py | 86 +---- .../benchmark/benchmark_pipeline.py | 360 +++--------------- src/deepsparse/benchmark/helpers.py | 78 ++++ src/deepsparse/benchmark/pipeline_sandbox.py | 62 --- 4 files changed, 146 insertions(+), 440 deletions(-) create mode 100644 
src/deepsparse/benchmark/helpers.py delete mode 100644 src/deepsparse/benchmark/pipeline_sandbox.py diff --git a/src/deepsparse/benchmark/benchmark_model.py b/src/deepsparse/benchmark/benchmark_model.py index 0bef7c57ed..ea280cc809 100644 --- a/src/deepsparse/benchmark/benchmark_model.py +++ b/src/deepsparse/benchmark/benchmark_model.py @@ -98,7 +98,7 @@ import os from typing import Dict -from deepsparse import Scheduler, __version__, compile_model +from deepsparse import __version__, compile_model from deepsparse.benchmark.ort_engine import ORTEngine from deepsparse.benchmark.stream_benchmark import model_stream_benchmark from deepsparse.cpu import cpu_architecture @@ -109,6 +109,12 @@ override_onnx_input_shapes, parse_input_shapes, ) +from deepsparse.benchmark.helpers import ( + decide_thread_pinning, + parse_scheduler, + parse_scenario, + parse_num_streams +) __all__ = ["benchmark_model"] @@ -241,78 +247,6 @@ def parse_args(): return parser.parse_args() -def decide_thread_pinning(pinning_mode: str) -> None: - pinning_mode = pinning_mode.lower() - if pinning_mode in "core": - os.environ["NM_BIND_THREADS_TO_CORES"] = "1" - _LOGGER.info("Thread pinning to cores enabled") - elif pinning_mode in "numa": - os.environ["NM_BIND_THREADS_TO_CORES"] = "0" - os.environ["NM_BIND_THREADS_TO_SOCKETS"] = "1" - _LOGGER.info("Thread pinning to socket/numa nodes enabled") - elif pinning_mode in "none": - os.environ["NM_BIND_THREADS_TO_CORES"] = "0" - os.environ["NM_BIND_THREADS_TO_SOCKETS"] = "0" - _LOGGER.info("Thread pinning disabled, performance may be sub-optimal") - else: - _LOGGER.info( - "Recieved invalid option for thread_pinning '{}', skipping".format( - pinning_mode - ) - ) - - -def parse_scheduler(scenario: str) -> Scheduler: - scenario = scenario.lower() - if scenario == "multistream": - return Scheduler.multi_stream - elif scenario == "singlestream": - return Scheduler.single_stream - elif scenario == "elastic": - return Scheduler.elastic - else: - return Scheduler.multi_stream - - -def parse_scenario(scenario: str) -> str: - scenario = scenario.lower() - if scenario == "async": - return "multistream" - elif scenario == "sync": - return "singlestream" - elif scenario == "elastic": - return "elastic" - else: - _LOGGER.info( - "Recieved invalid option for scenario'{}', defaulting to async".format( - scenario - ) - ) - return "multistream" - - -def parse_num_streams(num_streams: int, num_cores: int, scenario: str): - # If model.num_streams is set, and the scenario is either "multi_stream" or - # "elastic", use the value of num_streams given to us by the model, otherwise - # use a semi-sane default value. - if scenario == "sync" or scenario == "singlestream": - if num_streams and num_streams > 1: - _LOGGER.info("num_streams reduced to 1 for singlestream scenario.") - return 1 - else: - if num_streams: - return num_streams - else: - default_num_streams = max(1, int(num_cores / 2)) - _LOGGER.info( - "num_streams default value chosen of {}. 
" - "This requires tuning and may be sub-optimal".format( - default_num_streams - ) - ) - return default_num_streams - - def load_custom_engine(custom_engine_identifier: str): """ import a custom engine based off the specified `custom_engine_identifier` @@ -349,15 +283,15 @@ def benchmark_model( if num_cores is None: num_cores = cpu_architecture().num_available_physical_cores - decide_thread_pinning(thread_pinning) + decide_thread_pinning(thread_pinning, _LOGGER) - scenario = parse_scenario(scenario.lower()) + scenario = parse_scenario(scenario.lower(), _LOGGER) scheduler = parse_scheduler(scenario) input_shapes = parse_input_shapes(input_shapes) orig_model_path = model_path model_path = model_to_path(model_path) - num_streams = parse_num_streams(num_streams, num_cores, scenario) + num_streams = parse_num_streams(num_streams, num_cores, scenario, _LOGGER) # Compile the ONNX into a runnable model if engine == DEEPSPARSE_ENGINE: diff --git a/src/deepsparse/benchmark/benchmark_pipeline.py b/src/deepsparse/benchmark/benchmark_pipeline.py index c33c546130..9b27f3a6e2 100644 --- a/src/deepsparse/benchmark/benchmark_pipeline.py +++ b/src/deepsparse/benchmark/benchmark_pipeline.py @@ -12,91 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" -Benchmarking script ONNX models in a DeepSparse pipeline - -########## -Command help: -usage: deepsparse.benchmark_pipeline [-h] [-b BATCH_SIZE] [-input_config INPUT_CONFIG] - [-ncores NUM_CORES] [-s {async,sync,elastic}] - [-t TIME] [-w WARMUP_TIME] [-nstreams NUM_STREAMS] - [-pin {none,core,numa}] - [-e {deepsparse,onnxruntime}] [-q] - [-x EXPORT_PATH] - model_path - -Benchmark ONNX models in a DeepSparse pipeline - -positional arguments: - model_path Path to an ONNX model file or SparseZoo model stub. - -optional arguments: - -h, --help show this help message and exit. - -b BATCH_SIZE, --batch_size BATCH_SIZE - The batch size to run the analysis for. Must be - greater than 0. - -input_config INPUT_CONFIG - JSON file containing schema for input data. - -ncores NUM_CORES, --num_cores NUM_CORES - The number of physical cores to run the analysis on, - defaults to all physical cores available on the system. - -s {async,sync,elastic}, --scenario {async,sync,elastic} - Choose between using the async, sync and elastic - scenarios. Sync and async are similar to the single- - stream/multi-stream scenarios. Elastic is a newer - scenario that behaves similarly to the async scenario - but uses a different scheduling backend. Default value - is sync. - -t TIME, --time TIME The number of seconds the benchmark will run. Default - is 10 seconds. - -w WARMUP_TIME, --warmup_time WARMUP_TIME - The number of seconds the benchmark will warmup before - running.Default is 2 seconds. - -nstreams NUM_STREAMS, --num_streams NUM_STREAMS - The number of streams that will submit inferences in - parallel using async scenario. Default is - automatically determined for given hardware and may be - sub-optimal. - -pin {none,core,numa}, --thread_pinning {none,core,numa} - Enable binding threads to cores ('core' the default), - threads to cores on sockets ('numa'), or disable - ('none'). - -e {deepsparse,onnxruntime}, --engine {deepsparse,onnxruntime} - Inference engine backend to run eval on. Choices are - 'deepsparse', 'onnxruntime'. Default is 'deepsparse'. - -q, --quiet Lower logging verbosity. - -x EXPORT_PATH, --export_path EXPORT_PATH - Store results into a JSON file. 
- -########## -Example on a BERT from SparseZoo: -deepsparse.benchmark \ - zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/base-none - -########## -Example on a BERT from SparseZoo with sequence length 512: -deepsparse.benchmark \ - zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/base-none \ - --input_shapes "[1,512],[1,512],[1,512]" - -########## -Example on local ONNX model: -deepsparse.benchmark /PATH/TO/model.onnx - -########## -Example on local ONNX model at batch size 32 with synchronous (singlestream) execution: -deepsparse.benchmark /PATH/TO/model.onnx --batch_size 32 --scenario sync - -""" - import argparse import importlib import json +import string import logging +import random import os from typing import Dict -from deepsparse import Scheduler, __version__, compile_model +from deepsparse import Pipeline from deepsparse.benchmark.ort_engine import ORTEngine from deepsparse.benchmark.stream_benchmark import model_stream_benchmark from deepsparse.cpu import cpu_architecture @@ -107,9 +32,15 @@ override_onnx_input_shapes, parse_input_shapes, ) +from deepsparse.benchmark.helpers import ( + decide_thread_pinning, + parse_scheduler, + parse_scenario, + parse_num_streams +) -__all__ = ["benchmark_pipelin"] +__all__ = ["benchmark_pipeline"] _LOGGER = logging.getLogger(__name__) @@ -120,15 +51,33 @@ def parse_args(): parser = argparse.ArgumentParser( - description="Benchmark ONNX models in the DeepSparse Engine" + description="Benchmark DeepSparse Pipelines" + ) + parser.add_argument( + "task_name", + type=str, + help="Type of pipeline to run" ) - parser.add_argument( "model_path", type=str, help="Path to an ONNX model file or SparseZoo model stub", ) - + parser.add_argument( + "-c", + "--input_config", + type=str, + default="config.json", + help="JSON file containing schema for input data" + ) + parser.add_argument( + "-i", + "--input_type", + type=str, + default="dummy", + choices=["dummy", "real"], + help="Type of input data to use, real or randomly generated" + ) parser.add_argument( "-b", "--batch_size", @@ -136,13 +85,6 @@ def parse_args(): default=1, help="The batch size to run the analysis for. Must be greater than 0", ) - parser.add_argument( - "-i", - "--input_config", - type=str, - default="config.json", - help="JSON file containing schema for input data" - ) parser.add_argument( "-ncores", "--num_cores", @@ -173,16 +115,6 @@ def parse_args(): default=10, help="The number of seconds the benchmark will run. Default is 10 seconds.", ) - parser.add_argument( - "-w", - "--warmup_time", - type=int, - default=2, - help=( - "The number of seconds the benchmark will warmup before running." - "Default is 2 seconds." 
- ), - ) parser.add_argument( "-nstreams", "--num_streams", @@ -236,238 +168,62 @@ def parse_args(): return parser.parse_args() -def decide_thread_pinning(pinning_mode: str) -> None: - pinning_mode = pinning_mode.lower() - if pinning_mode in "core": - os.environ["NM_BIND_THREADS_TO_CORES"] = "1" - _LOGGER.info("Thread pinning to cores enabled") - elif pinning_mode in "numa": - os.environ["NM_BIND_THREADS_TO_CORES"] = "0" - os.environ["NM_BIND_THREADS_TO_SOCKETS"] = "1" - _LOGGER.info("Thread pinning to socket/numa nodes enabled") - elif pinning_mode in "none": - os.environ["NM_BIND_THREADS_TO_CORES"] = "0" - os.environ["NM_BIND_THREADS_TO_SOCKETS"] = "0" - _LOGGER.info("Thread pinning disabled, performance may be sub-optimal") - else: - _LOGGER.info( - "Recieved invalid option for thread_pinning '{}', skipping".format( - pinning_mode - ) - ) - -def parse_input_config(input_config: str) -> Dict[str, object]: - return json.loads(input_config) - -def parse_scheduler(scenario: str) -> Scheduler: - scenario = scenario.lower() - if scenario == "multistream": - return Scheduler.multi_stream - elif scenario == "singlestream": - return Scheduler.single_stream - elif scenario == "elastic": - return Scheduler.elastic - else: - return Scheduler.multi_stream - - -def parse_scenario(scenario: str) -> str: - scenario = scenario.lower() - if scenario == "async": - return "multistream" - elif scenario == "sync": - return "singlestream" - elif scenario == "elastic": - return "elastic" - else: - _LOGGER.info( - "Recieved invalid option for scenario'{}', defaulting to async".format( - scenario - ) - ) - return "multistream" - - -def parse_num_streams(num_streams: int, num_cores: int, scenario: str): - # If model.num_streams is set, and the scenario is either "multi_stream" or - # "elastic", use the value of num_streams given to us by the model, otherwise - # use a semi-sane default value. - if scenario == "sync" or scenario == "singlestream": - if num_streams and num_streams > 1: - _LOGGER.info("num_streams reduced to 1 for singlestream scenario.") - return 1 - else: - if num_streams: - return num_streams - else: - default_num_streams = max(1, int(num_cores / 2)) - _LOGGER.info( - "num_streams default value chosen of {}. 
" - "This requires tuning and may be sub-optimal".format( - default_num_streams - ) - ) - return default_num_streams - - -def load_custom_engine(custom_engine_identifier: str): - """ - import a custom engine based off the specified `custom_engine_identifier` - from user specified script - - :param custom_engine_identifier: string in the form of - ': - :return: custom engine class object - """ - path, engine_object_name = custom_engine_identifier.split(":") - spec = importlib.util.spec_from_file_location("user_defined_custom_engine", path) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - return getattr(module, engine_object_name) +def parse_input_config(input_config_file: str) -> Dict[str, object]: + config_file = open(input_config_file) + config = json.load(config_file) + config_file.close() + return config def benchmark_pipeline( model_path: str, - batch_size: int = 1, - input_config: str = "", + task: str, + input_config: str, + input_type: str = "dummy", num_cores: int = None, scenario: str = "sync", time: int = 10, - warmup_time: int = 2, num_streams: int = None, thread_pinning: str = "core", engine: str = DEEPSPARSE_ENGINE, quiet: bool = False, export_path: str = None, ) -> Dict: - if quiet: - set_logging_level(logging.WARN) - - if num_cores is None: - num_cores = cpu_architecture().num_available_physical_cores - - decide_thread_pinning(thread_pinning) - - scenario = parse_scenario(scenario.lower()) - scheduler = parse_scheduler(scenario) - input_config = parse_input_config(input_config) - - orig_model_path = model_path - model_path = model_to_path(model_path) - num_streams = parse_num_streams(num_streams, num_cores, scenario) - - # Compile the ONNX into a runnable model - if engine == DEEPSPARSE_ENGINE: - model = compile_model( - model=model_path, - batch_size=batch_size, - num_cores=num_cores, - num_streams=num_streams, - scheduler=scheduler, - input_shapes=input_shapes, - ) - elif engine == ORT_ENGINE: - model = ORTEngine( - model=model_path, - batch_size=batch_size, - num_cores=num_cores, - input_shapes=input_shapes, - ) - elif ":" in engine: - engine = load_custom_engine(custom_engine_identifier=engine) - model = engine( - model_path=model_path, - batch_size=batch_size, - num_cores=num_cores, - ) - else: - raise ValueError(f"Invalid engine choice '{engine}'") - _LOGGER.info(model) - - # Generate random inputs to feed the model - # TODO(mgoin): should be able to query Engine class instead of loading ONNX - if input_shapes: - with override_onnx_input_shapes(model_path, input_shapes) as model_path: - input_list = generate_random_inputs(model_path, batch_size) - elif hasattr(model, "generate_random_inputs"): - input_list = model.generate_random_inputs() - elif hasattr(engine, "generate_random_inputs"): - input_list = engine.generate_random_inputs(batch_size=batch_size) - else: - input_list = generate_random_inputs(model_path, batch_size) - - # Benchmark - _LOGGER.info( - "Starting '{}' performance measurements for {} seconds".format(scenario, time) - ) - benchmark_result = model_stream_benchmark( - model, - input_list, - scenario=scenario, - seconds_to_run=time, - seconds_to_warmup=warmup_time, - num_streams=num_streams, - ) + + config = parse_input_config(input_config) - export_dict = { - "engine": str(model), - "version": __version__, - "orig_model_path": orig_model_path, - "model_path": model_path, - "batch_size": batch_size, - "input_config": input_config, - "num_cores": num_cores, - "scenario": scenario, - "scheduler": str(model.scheduler), - 
"seconds_to_run": time, - "num_streams": num_streams, - "benchmark_result": benchmark_result, - "fraction_of_supported_ops": getattr(model, "fraction_of_supported_ops", None), - } + data_length = config['length'] + num_examples = config['num_examples'] + examples = [] + if config['input_data_type'] == "string": + for _ in range(num_examples): + rand_string = ''.join(random.choices(string.printable, k=data_length)) + examples.append(rand_string) + print(examples) - # Export results - if export_path: - _LOGGER.info("Saving benchmark results to JSON file at {}".format(export_path)) - with open(export_path, "w") as out: - json.dump(export_dict, out, indent=2) + pipeline = Pipeline.create(task=task, model_path=model_path) + output = pipeline(examples) + print(output) - return export_dict + return {} def main(): - args = parse_args() result = benchmark_pipeline( model_path=args.model_path, - batch_size=args.batch_size, - input_config=args.input_config, - num_cores=args.num_cores, - scenario=args.scenario, - time=args.time, - warmup_time=args.warmup_time, - num_streams=args.num_streams, - thread_pinning=args.thread_pinning, - engine=args.engine, - quiet=args.quiet, - export_path=args.export_path, + task=args.task_name, + input_config = args.input_config, + input_type = args.input_type ) # Results summary print("Original Model Path: {}".format(args.model_path)) + print("Task: {}".format(args.task_name)) + print("Input Type: {}".format(args.input_type)) print("Batch Size: {}".format(args.batch_size)) print("Scenario: {}".format(args.scenario)) - print( - "Throughput (items/sec): {:.4f}".format( - result["benchmark_result"]["items_per_sec"] - ) - ) - print("Latency Mean (ms/batch): {:.4f}".format(result["benchmark_result"]["mean"])) - print( - "Latency Median (ms/batch): {:.4f}".format(result["benchmark_result"]["median"]) - ) - print("Latency Std (ms/batch): {:.4f}".format(result["benchmark_result"]["std"])) - print("Iterations: {}".format(int(result["benchmark_result"]["iterations"]))) if __name__ == "__main__": diff --git a/src/deepsparse/benchmark/helpers.py b/src/deepsparse/benchmark/helpers.py new file mode 100644 index 0000000000..14c90653a6 --- /dev/null +++ b/src/deepsparse/benchmark/helpers.py @@ -0,0 +1,78 @@ +import os + +from deepsparse import Scheduler + +__all__ = [ + "decide_thread_pinning", + "parse_scheduler", + "parse_scenario", + "parse_num_streams" +] + +def decide_thread_pinning(pinning_mode: str, logger: object) -> None: + pinning_mode = pinning_mode.lower() + if pinning_mode in "core": + os.environ["NM_BIND_THREADS_TO_CORES"] = "1" + logger.info("Thread pinning to cores enabled") + elif pinning_mode in "numa": + os.environ["NM_BIND_THREADS_TO_CORES"] = "0" + os.environ["NM_BIND_THREADS_TO_SOCKETS"] = "1" + logger.info("Thread pinning to socket/numa nodes enabled") + elif pinning_mode in "none": + os.environ["NM_BIND_THREADS_TO_CORES"] = "0" + os.environ["NM_BIND_THREADS_TO_SOCKETS"] = "0" + logger.info("Thread pinning disabled, performance may be sub-optimal") + else: + logger.info( + "Recieved invalid option for thread_pinning '{}', skipping".format( + pinning_mode + ) + ) + +def parse_scheduler(scenario: str) -> Scheduler: + scenario = scenario.lower() + if scenario == "multistream": + return Scheduler.multi_stream + elif scenario == "singlestream": + return Scheduler.single_stream + elif scenario == "elastic": + return Scheduler.elastic + else: + return Scheduler.multi_stream + +def parse_scenario(scenario: str, logger: object) -> str: + scenario = scenario.lower() + if 
scenario == "async": + return "multistream" + elif scenario == "sync": + return "singlestream" + elif scenario == "elastic": + return "elastic" + else: + logger.info( + "Recieved invalid option for scenario'{}', defaulting to async".format( + scenario + ) + ) + return "multistream" + +def parse_num_streams(num_streams: int, num_cores: int, scenario: str, logger: object): + # If model.num_streams is set, and the scenario is either "multi_stream" or + # "elastic", use the value of num_streams given to us by the model, otherwise + # use a semi-sane default value. + if scenario == "sync" or scenario == "singlestream": + if num_streams and num_streams > 1: + logger.info("num_streams reduced to 1 for singlestream scenario.") + return 1 + else: + if num_streams: + return num_streams + else: + default_num_streams = max(1, int(num_cores / 2)) + logger.info( + "num_streams default value chosen of {}. " + "This requires tuning and may be sub-optimal".format( + default_num_streams + ) + ) + return default_num_streams diff --git a/src/deepsparse/benchmark/pipeline_sandbox.py b/src/deepsparse/benchmark/pipeline_sandbox.py deleted file mode 100644 index 3885c90d44..0000000000 --- a/src/deepsparse/benchmark/pipeline_sandbox.py +++ /dev/null @@ -1,62 +0,0 @@ -import argparse -import json -import random -import string - -from deepsparse.pipeline import Pipeline - -def parse_args(): - parser = argparse.ArgumentParser( - description="Benchmark DeepSparse Pipelines" - ) - parser.add_argument( - "task_name", - type=str - ) - parser.add_argument( - "model_path", - type=str - ) - parser.add_argument( - "-i", - "--input_type", - type=str, - default="dummy", - choices=["dummy", "real"], - ) - parser.add_argument( - "-c", - "--config", - type=str, - default="config.json", - ) - - return parser.parse_args() - -def main(): - args = parse_args() - - config_file = open(args.config) - config = json.load(config_file) - config_file.close() - - task_name = args.task_name - model_path = args.model_path - - data_length = config['length'] - num_examples = config['num_examples'] - examples = [] - if config['input_data_type'] == "string": - for _ in range(num_examples): - rand_string = ''.join(random.choices(string.printable, k=data_length)) - examples.append(rand_string) - print(examples) - - pipeline = Pipeline.create(task=task_name, model_path=model_path) - output = pipeline(examples) - print(output) - print(pipeline.timer_manger) - print(pipeline.timer_manager.stages) - -if __name__ == "__main__": - main() \ No newline at end of file From 956dbe8140e9c49fe08f3419798be2f9d7997e6b Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Tue, 25 Jul 2023 09:05:04 -0400 Subject: [PATCH 04/37] adding additional cmd line arguments --- .../benchmark/benchmark_pipeline.py | 43 ++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/src/deepsparse/benchmark/benchmark_pipeline.py b/src/deepsparse/benchmark/benchmark_pipeline.py index 9b27f3a6e2..0fcc98ddd0 100644 --- a/src/deepsparse/benchmark/benchmark_pipeline.py +++ b/src/deepsparse/benchmark/benchmark_pipeline.py @@ -21,6 +21,7 @@ import os from typing import Dict +from deepsparse import __version__, compile_model from deepsparse import Pipeline from deepsparse.benchmark.ort_engine import ORTEngine from deepsparse.benchmark.stream_benchmark import model_stream_benchmark @@ -180,6 +181,7 @@ def benchmark_pipeline( task: str, input_config: str, input_type: str = "dummy", + batch_size: int = 1, num_cores: int = None, scenario: str = "sync", time: int = 10, @@ -190,6 
+192,36 @@ def benchmark_pipeline( export_path: str = None, ) -> Dict: + if quiet: + set_logging_level(logging.WARN) + + if num_cores is None: + num_cores = cpu_architecture().num_available_physical_cores + + decide_thread_pinning(thread_pinning, _LOGGER) + scenario = parse_scenario(scenario.lower(), _LOGGER) + scheduler = parse_scheduler(scenario) + num_streams = parse_num_streams(num_streams, num_cores, scenario, _LOGGER) + + # Compile the ONNX into a runnable model + if engine == DEEPSPARSE_ENGINE: + model = compile_model( + model=model_path, + batch_size=batch_size, + num_cores=num_cores, + num_streams=num_streams, + scheduler=scheduler, + ) + elif engine == ORT_ENGINE: + model = ORTEngine( + model=model_path, + batch_size=batch_size, + num_cores=num_cores, + ) + else: + raise ValueError(f"Invalid engine choice '{engine}'") + _LOGGER.info(model) + config = parse_input_config(input_config) data_length = config['length'] @@ -215,7 +247,16 @@ def main(): model_path=args.model_path, task=args.task_name, input_config = args.input_config, - input_type = args.input_type + input_type = args.input_type, + batch_size=args.batch_size, + num_cores=args.num_cores, + scenario=args.scenario, + time=args.time, + num_streams=args.num_streams, + thread_pinning=args.thread_pinning, + engine=args.engine, + quiet=args.quiet, + export_path=args.export_path, ) # Results summary From 6cbc99eb4b14e5e1ddefeaccfe5ae2a91fd1088a Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Tue, 25 Jul 2023 12:13:18 -0400 Subject: [PATCH 05/37] image and text inputs --- .../benchmark/benchmark_pipeline.py | 75 ++++++++++++++----- 1 file changed, 56 insertions(+), 19 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_pipeline.py b/src/deepsparse/benchmark/benchmark_pipeline.py index 0fcc98ddd0..a56cc22486 100644 --- a/src/deepsparse/benchmark/benchmark_pipeline.py +++ b/src/deepsparse/benchmark/benchmark_pipeline.py @@ -19,7 +19,9 @@ import logging import random import os -from typing import Dict +from typing import Dict, List +import time +import numpy from deepsparse import __version__, compile_model from deepsparse import Pipeline @@ -27,6 +29,7 @@ from deepsparse.benchmark.stream_benchmark import model_stream_benchmark from deepsparse.cpu import cpu_architecture from deepsparse.log import set_logging_level +from deepsparse.utils.timer import StagedTimer from deepsparse.utils import ( generate_random_inputs, model_to_path, @@ -184,13 +187,13 @@ def benchmark_pipeline( batch_size: int = 1, num_cores: int = None, scenario: str = "sync", - time: int = 10, + seconds_to_run: int = 10, num_streams: int = None, thread_pinning: str = "core", engine: str = DEEPSPARSE_ENGINE, quiet: bool = False, export_path: str = None, -) -> Dict: +) -> List[StagedTimer]: if quiet: set_logging_level(logging.WARN) @@ -223,26 +226,43 @@ def benchmark_pipeline( _LOGGER.info(model) config = parse_input_config(input_config) + pipeline = Pipeline.create(task=task, model_path=model_path) - data_length = config['length'] - num_examples = config['num_examples'] - examples = [] + input_data = [] if config['input_data_type'] == "string": - for _ in range(num_examples): + data_length = config['sequence_length'] + for _ in range(batch_size): rand_string = ''.join(random.choices(string.printable, k=data_length)) - examples.append(rand_string) - print(examples) + input_data.append(rand_string) + inputs = pipeline.input_schema(sequences=input_data) + elif config['input_data_type'] == "array": + image_shape = config["input_array_shape"] + dtype = 
config["input_array_dtype"] + for _ in range(batch_size): + if dtype == "uint8": + rand_array = numpy.random.randint(0,high=255, size=image_shape).astype(dtype) + rand_array = numpy.random.rand(*image_shape).astype(dtype) + input_data.append(rand_array) + inputs = pipeline.input_schema(images=input_data) - pipeline = Pipeline.create(task=task, model_path=model_path) - output = pipeline(examples) - print(output) + benchmark_end_time = time.perf_counter() + seconds_to_run + batch_timings = [] + while time.perf_counter() < benchmark_end_time: + output = pipeline(inputs) + batch_timings.append(pipeline.timer_manager.latest) - return {} + return batch_timings def main(): args = parse_args() + print("Original Model Path: {}".format(args.model_path)) + print("Task: {}".format(args.task_name)) + print("Input Type: {}".format(args.input_type)) + print("Batch Size: {}".format(args.batch_size)) + print("Scenario: {}".format(args.scenario)) + result = benchmark_pipeline( model_path=args.model_path, task=args.task_name, @@ -251,7 +271,7 @@ def main(): batch_size=args.batch_size, num_cores=args.num_cores, scenario=args.scenario, - time=args.time, + seconds_to_run=args.time, num_streams=args.num_streams, thread_pinning=args.thread_pinning, engine=args.engine, @@ -260,11 +280,28 @@ def main(): ) # Results summary - print("Original Model Path: {}".format(args.model_path)) - print("Task: {}".format(args.task_name)) - print("Input Type: {}".format(args.input_type)) - print("Batch Size: {}".format(args.batch_size)) - print("Scenario: {}".format(args.scenario)) + batches_processed = len(result) + total_time = sum(st.times['total_inference'] for st in result) + print("Processed {} batches in {} seconds".format(batches_processed, total_time)) + throughput = round(batches_processed / total_time, 4) + print("Throughput: {} batches/sec".format(throughput)) + total_pre_process = sum(st.times['pre_process'] for st in result) + total_post_process = sum(st.times['post_process'] for st in result) + total_engine_forward = sum(st.times['engine_forward'] for st in result) + + avg_pre_process = round(total_pre_process / batches_processed * 1000, 4) + avg_post_process = round(total_post_process / batches_processed * 1000, 4) + avg_engine_forward = round(total_engine_forward / batches_processed * 1000, 4) + + print("Average Pre-Process: {} ms".format(avg_pre_process)) + print("Average Post-Process: {} ms".format(avg_post_process)) + print("Average Engine Forward: {} ms".format(avg_engine_forward)) + + total_time = total_pre_process + total_post_process + total_engine_forward + percent_pre = round(total_pre_process / total_time * 100, 2) + percent_post = round(total_post_process / total_time * 100, 2) + percent_forward = round(total_engine_forward / total_time * 100, 2) + print("{}% Pre-processing, {}% Post-processing, {}% Inference".format(percent_pre, percent_post, percent_forward)) if __name__ == "__main__": From 0143d318185a7588d6dd8c2e0b9eb8629570ada9 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Tue, 25 Jul 2023 14:33:07 -0400 Subject: [PATCH 06/37] json export of statistics --- .../benchmark/benchmark_pipeline.py | 199 +++++++++++------- 1 file changed, 124 insertions(+), 75 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_pipeline.py b/src/deepsparse/benchmark/benchmark_pipeline.py index a56cc22486..5440724066 100644 --- a/src/deepsparse/benchmark/benchmark_pipeline.py +++ b/src/deepsparse/benchmark/benchmark_pipeline.py @@ -13,29 +13,21 @@ # limitations under the License. 
import argparse -import importlib import json import string import logging import random -import os -from typing import Dict, List +from typing import Dict, List, Tuple import time import numpy +import threading +import queue -from deepsparse import __version__, compile_model +from deepsparse import __version__ from deepsparse import Pipeline -from deepsparse.benchmark.ort_engine import ORTEngine -from deepsparse.benchmark.stream_benchmark import model_stream_benchmark from deepsparse.cpu import cpu_architecture from deepsparse.log import set_logging_level from deepsparse.utils.timer import StagedTimer -from deepsparse.utils import ( - generate_random_inputs, - model_to_path, - override_onnx_input_shapes, - parse_input_shapes, -) from deepsparse.benchmark.helpers import ( decide_thread_pinning, parse_scheduler, @@ -141,19 +133,6 @@ def parse_args(): "threads to cores on sockets ('numa'), or disable ('none')" ), ) - parser.add_argument( - "-e", - "--engine", - type=str, - default=DEEPSPARSE_ENGINE, - help=( - "Inference engine backend to run eval on. Choices are 'deepsparse', " - "'onnxruntime'. Default is 'deepsparse'. Can also specify a user " - "defined engine class by giving the script and class name in the " - "following format :. This " - "engine class will be dynamically imported during runtime" - ), - ) parser.add_argument( "-q", "--quiet", @@ -171,9 +150,63 @@ def parse_args(): return parser.parse_args() +class PipelineExecutorThread(threading.Thread): + def __init__( + self, + pipeline: Pipeline, + inputs: List[any], + time_queue: queue.Queue, + max_time: float + ): + super(PipelineExecutorThread, self).__init__() + self._pipeline = pipeline + self._inputs = inputs + self._time_queue = time_queue + self._max_time = max_time + + def run(self): + while time.perf_counter() < self._max_time: + output = self._pipeline(self._inputs) + self._time_queue.put(self._pipeline.timer_manager.latest) + + +def singlestream_benchmark( + pipeline: Pipeline, + inputs: List[any], + seconds_to_run: float +) -> List[StagedTimer]: + benchmark_end_time = time.perf_counter() + seconds_to_run + batch_timings = [] + while time.perf_counter() < benchmark_end_time: + output = pipeline(inputs) + batch_timings.append(pipeline.timer_manager.latest) + + return batch_timings + +def multistream_benchmark( + pipeline: Pipeline, + inputs: List[any], + seconds_to_run: float, + num_streams: int, +) -> List[StagedTimer]: + time_queue = queue.Queue() + max_time = time.perf_counter() + seconds_to_run + threads = [] + + # Sara TODO: should these all be sharing the same pipeline? 
+ for thread in range(num_streams): + threads.append(PipelineExecutorThread(pipeline, inputs, time_queue, max_time)) + for thread in threads: + thread.start() -def parse_input_config(input_config_file: str) -> Dict[str, object]: + for thread in threads: + thread.join() + + return list(time_queue.queue) + + +def parse_input_config(input_config_file: str) -> Dict[str, any]: config_file = open(input_config_file) config = json.load(config_file) config_file.close() @@ -190,10 +223,9 @@ def benchmark_pipeline( seconds_to_run: int = 10, num_streams: int = None, thread_pinning: str = "core", - engine: str = DEEPSPARSE_ENGINE, quiet: bool = False, export_path: str = None, -) -> List[StagedTimer]: +) -> Tuple[List[StagedTimer],float] : if quiet: set_logging_level(logging.WARN) @@ -205,25 +237,6 @@ def benchmark_pipeline( scenario = parse_scenario(scenario.lower(), _LOGGER) scheduler = parse_scheduler(scenario) num_streams = parse_num_streams(num_streams, num_cores, scenario, _LOGGER) - - # Compile the ONNX into a runnable model - if engine == DEEPSPARSE_ENGINE: - model = compile_model( - model=model_path, - batch_size=batch_size, - num_cores=num_cores, - num_streams=num_streams, - scheduler=scheduler, - ) - elif engine == ORT_ENGINE: - model = ORTEngine( - model=model_path, - batch_size=batch_size, - num_cores=num_cores, - ) - else: - raise ValueError(f"Invalid engine choice '{engine}'") - _LOGGER.info(model) config = parse_input_config(input_config) pipeline = Pipeline.create(task=task, model_path=model_path) @@ -245,14 +258,39 @@ def benchmark_pipeline( input_data.append(rand_array) inputs = pipeline.input_schema(images=input_data) - benchmark_end_time = time.perf_counter() + seconds_to_run - batch_timings = [] - while time.perf_counter() < benchmark_end_time: - output = pipeline(inputs) - batch_timings.append(pipeline.timer_manager.latest) + start_time = time.perf_counter() + if scenario == "singlestream": + batch_times = singlestream_benchmark(pipeline, inputs, seconds_to_run) + elif scenario == "multistream": + batch_times = multistream_benchmark(pipeline, inputs, seconds_to_run, num_streams) + elif scenario == "elastic": + batch_times = multistream_benchmark(pipeline, inputs, seconds_to_run, num_streams) + else: + raise Exception(f"Unknown scenario '{scenario}'") - return batch_timings + if len(batch_times) == 0: + raise Exception( + "Generated no batch timings, try extending benchmark time with '--time'" + ) + end_time = time.perf_counter() + total_run_time = end_time - start_time + + return batch_times, total_run_time + +def calculate_statistics(batch_times_ms: List[float]) -> Dict: + percentiles = [25.0, 50.0, 75.0, 90.0, 95.0, 99.0, 99.9] + buckets = numpy.percentile(batch_times_ms, percentiles).tolist() + percentiles_dict = { + "{:2.1f}%".format(key): value for key, value in zip(percentiles, buckets) + } + benchmark_dict = { + "median": numpy.median(batch_times_ms), + "mean": numpy.mean(batch_times_ms), + "std": numpy.std(batch_times_ms), + **percentiles_dict, + } + return benchmark_dict def main(): args = parse_args() @@ -263,7 +301,7 @@ def main(): print("Batch Size: {}".format(args.batch_size)) print("Scenario: {}".format(args.scenario)) - result = benchmark_pipeline( + batch_times, total_run_time = benchmark_pipeline( model_path=args.model_path, task=args.task_name, input_config = args.input_config, @@ -274,35 +312,46 @@ def main(): seconds_to_run=args.time, num_streams=args.num_streams, thread_pinning=args.thread_pinning, - engine=args.engine, quiet=args.quiet, 
export_path=args.export_path, ) - # Results summary - batches_processed = len(result) - total_time = sum(st.times['total_inference'] for st in result) - print("Processed {} batches in {} seconds".format(batches_processed, total_time)) - throughput = round(batches_processed / total_time, 4) - print("Throughput: {} batches/sec".format(throughput)) - total_pre_process = sum(st.times['pre_process'] for st in result) - total_post_process = sum(st.times['post_process'] for st in result) - total_engine_forward = sum(st.times['engine_forward'] for st in result) + pre_process_times = [st.times['pre_process'] * 1000 for st in batch_times] + pre_stats = calculate_statistics(pre_process_times) + post_process_times = [st.times['post_process'] * 1000 for st in batch_times] + post_stats = calculate_statistics(post_process_times) + engine_forward_times = [st.times['engine_forward'] * 1000 for st in batch_times] + forward_stats = calculate_statistics(engine_forward_times) - avg_pre_process = round(total_pre_process / batches_processed * 1000, 4) - avg_post_process = round(total_post_process / batches_processed * 1000, 4) - avg_engine_forward = round(total_engine_forward / batches_processed * 1000, 4) - - print("Average Pre-Process: {} ms".format(avg_pre_process)) - print("Average Post-Process: {} ms".format(avg_post_process)) - print("Average Engine Forward: {} ms".format(avg_engine_forward)) + items_per_sec = (len(batch_times) * args.batch_size) / total_run_time + total_pre_process = sum(pre_process_times) + total_post_process = sum(post_process_times) + total_engine_forward = sum(engine_forward_times) total_time = total_pre_process + total_post_process + total_engine_forward - percent_pre = round(total_pre_process / total_time * 100, 2) - percent_post = round(total_post_process / total_time * 100, 2) - percent_forward = round(total_engine_forward / total_time * 100, 2) - print("{}% Pre-processing, {}% Post-processing, {}% Inference".format(percent_pre, percent_post, percent_forward)) + percent_pre = total_pre_process / total_time * 100 + percent_post = total_post_process / total_time * 100 + percent_forward = total_engine_forward / total_time * 100 + + export_dict = { + "scenario": args.scenario, + "items_per_sec": items_per_sec, + "seconds_ran": total_run_time, + "iterations": len(batch_times), + "percent_pre": percent_pre, + "percent_post": percent_post, + "percent_forward": percent_forward, + "pre_stats": pre_stats, + "post_stats": post_stats, + "forward_stats": forward_stats + } + # Export results + export_path = args.export_path + if export_path: + _LOGGER.info("Saving benchmark results to JSON file at {}".format(export_path)) + with open(export_path, "w") as out: + json.dump(export_dict, out, indent=2) if __name__ == "__main__": main() From 58edf0580d6191ef28fecebb7ae5140bc73c3f32 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Tue, 25 Jul 2023 14:56:42 -0400 Subject: [PATCH 07/37] clean up printed output --- .../benchmark/benchmark_pipeline.py | 60 ++++++++++++------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_pipeline.py b/src/deepsparse/benchmark/benchmark_pipeline.py index 5440724066..2a630d914f 100644 --- a/src/deepsparse/benchmark/benchmark_pipeline.py +++ b/src/deepsparse/benchmark/benchmark_pipeline.py @@ -216,7 +216,6 @@ def benchmark_pipeline( model_path: str, task: str, input_config: str, - input_type: str = "dummy", batch_size: int = 1, num_cores: int = None, scenario: str = "sync", @@ -224,7 +223,6 @@ def benchmark_pipeline( 
num_streams: int = None, thread_pinning: str = "core", quiet: bool = False, - export_path: str = None, ) -> Tuple[List[StagedTimer],float] : if quiet: @@ -235,28 +233,34 @@ def benchmark_pipeline( decide_thread_pinning(thread_pinning, _LOGGER) scenario = parse_scenario(scenario.lower(), _LOGGER) - scheduler = parse_scheduler(scenario) num_streams = parse_num_streams(num_streams, num_cores, scenario, _LOGGER) config = parse_input_config(input_config) + input_type = config["data_type"] pipeline = Pipeline.create(task=task, model_path=model_path) input_data = [] - if config['input_data_type'] == "string": - data_length = config['sequence_length'] - for _ in range(batch_size): - rand_string = ''.join(random.choices(string.printable, k=data_length)) - input_data.append(rand_string) - inputs = pipeline.input_schema(sequences=input_data) - elif config['input_data_type'] == "array": - image_shape = config["input_array_shape"] - dtype = config["input_array_dtype"] - for _ in range(batch_size): - if dtype == "uint8": - rand_array = numpy.random.randint(0,high=255, size=image_shape).astype(dtype) - rand_array = numpy.random.rand(*image_shape).astype(dtype) - input_data.append(rand_array) - inputs = pipeline.input_schema(images=input_data) + if input_type == "dummy": + if config['input_data_type'] == "string": + data_length = config['sequence_length'] + for _ in range(batch_size): + rand_string = ''.join(random.choices(string.printable, k=data_length)) + input_data.append(rand_string) + inputs = pipeline.input_schema(sequences=input_data) + elif config['input_data_type'] == "array": + image_shape = config["input_array_shape"] + dtype = config["input_array_dtype"] + for _ in range(batch_size): + if dtype == "uint8": + rand_array = numpy.random.randint(0,high=255, size=image_shape).astype(dtype) + rand_array = numpy.random.rand(*image_shape).astype(dtype) + input_data.append(rand_array) + inputs = pipeline.input_schema(images=input_data) + elif input_type == "real": + raise Exception("Real input type not yet implemented") + else: + raise Exception(f"Unknown input type '{input_type}'") + start_time = time.perf_counter() if scenario == "singlestream": @@ -305,7 +309,6 @@ def main(): model_path=args.model_path, task=args.task_name, input_config = args.input_config, - input_type = args.input_type, batch_size=args.batch_size, num_cores=args.num_cores, scenario=args.scenario, @@ -313,7 +316,6 @@ def main(): num_streams=args.num_streams, thread_pinning=args.thread_pinning, quiet=args.quiet, - export_path=args.export_path, ) pre_process_times = [st.times['pre_process'] * 1000 for st in batch_times] @@ -353,5 +355,23 @@ def main(): with open(export_path, "w") as out: json.dump(export_dict, out, indent=2) + # Results summary + print("Original Model Path: {}".format(args.model_path)) + print("Batch Size: {}".format(args.batch_size)) + print("Scenario: {}".format(args.scenario)) + print("Iterations: {}".format(int(export_dict["iterations"]))) + print( + "Throughput (items/sec): {:.4f}".format( + export_dict["items_per_sec"] + ) + ) + print("Processing Time Breakdown: ") + print(" Pre-Processing: {:.2f}%".format(export_dict["percent_pre"])) + print(" Post-Processing: {:.2f}%".format(export_dict["percent_post"])) + print(" Forward Pass: {:.2f}%".format(export_dict["percent_forward"])) + print("Pre-Processing Latency Mean (ms/batch): {:.4f}".format(export_dict["pre_stats"]["mean"])) + print("Post-Processing Latency Mean (ms/batch): {:.4f}".format(export_dict["post_stats"]["mean"])) + print("Forward Pass Latency Mean 
(ms/batch): {:.4f}".format(export_dict["forward_stats"]["mean"])) + if __name__ == "__main__": main() From 75bda3ac6961aa9d2da2a97f337d8bd8d67d3f2d Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Tue, 25 Jul 2023 17:00:59 -0400 Subject: [PATCH 08/37] adding support for real data --- .../benchmark/benchmark_pipeline.py | 112 ++++++++++++++---- 1 file changed, 92 insertions(+), 20 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_pipeline.py b/src/deepsparse/benchmark/benchmark_pipeline.py index 2a630d914f..16904f2d56 100644 --- a/src/deepsparse/benchmark/benchmark_pipeline.py +++ b/src/deepsparse/benchmark/benchmark_pipeline.py @@ -22,6 +22,7 @@ import numpy import threading import queue +import glob from deepsparse import __version__ from deepsparse import Pipeline @@ -193,7 +194,6 @@ def multistream_benchmark( max_time = time.perf_counter() + seconds_to_run threads = [] - # Sara TODO: should these all be sharing the same pipeline? for thread in range(num_streams): threads.append(PipelineExecutorThread(pipeline, inputs, time_queue, max_time)) @@ -212,6 +212,84 @@ def parse_input_config(input_config_file: str) -> Dict[str, any]: config_file.close() return config +def get_input_schema_type(pipeline: Pipeline) -> str: + input_schema_requirements = list(pipeline.input_schema.__annotations__.keys()) + image_requirements = ["images"] + text_requirements = ["sequences", "text"] + + if len(input_schema_requirements) == 1: + requirement = input_schema_requirements[0] + if requirement in image_requirements: + return "image" + elif requirement in text_requirements: + return "text" + + raise Exception("Unknown schema requirement {}".format(input_schema_requirements)) + +def generate_image_data(config: Dict, batch_size: int) -> List[numpy.ndarray]: + input_data = [] + if "input_image_shape" in config and len(config["input_image_shape"]) == 3: + image_shape = config["input_image_shape"] + else: + image_shape = (240, 240, 3) + _LOGGER.warning("Using default image shape {}".format(image_shape)) + + for _ in range(batch_size): + rand_array = numpy.random.randint(0,high=255, size=image_shape).astype(numpy.uint8) + input_data.append(rand_array) + + return input_data + +def load_image_data(config: Dict, batch_size: int) -> List[str]: + path_to_data = config["data_folder"] + recursive_search = config["recursive_search"] + files = [] + for f in glob.glob(path_to_data + "/**", recursive=recursive_search): + if f.lower().endswith(".jpeg"): + files.append(f) + if len(files) < batch_size: + raise Exception("Not enough images found in {}".format(path_to_data)) + input_data = random.sample(files, batch_size) + + return input_data + +def generate_text_data(config: Dict, batch_size: int) -> List[str]: + input_data = [] + if 'sequence_length' in config: + string_length = config['sequence_length'] + else: + string_length = 100 + _LOGGER.warning("Using default string length {}".format(string_length)) + for _ in range(batch_size): + rand_string = ''.join(random.choices(string.printable, k=string_length)) + input_data.append(rand_string) + + return input_data + +def load_text_data(config: Dict, batch_size: int) -> List[str]: + path_to_data = config["data_folder"] + recursive_search = config["recursive_search"] + files = [] + for f in glob.glob(path_to_data + "/**", recursive=recursive_search): + if f.lower().endswith(".txt"): + files.append(f) + if len(files) < batch_size: + raise Exception("Not enough images found in {}".format(path_to_data)) + input_files = random.sample(files, batch_size) + if 
"max_string_length" in config: + max_string_length = config["max_string_length"] + else: + max_string_length = -1 + _LOGGER.warning("Using default max string length {}".format(max_string_length)) + input_data = [] + for f_path in input_files: + f = open(f_path) + text_data = f.read() + f.close() + input_data.append(text_data[:max_string_length]) + print(input_data) + return input_data + def benchmark_pipeline( model_path: str, task: str, @@ -238,26 +316,22 @@ def benchmark_pipeline( config = parse_input_config(input_config) input_type = config["data_type"] pipeline = Pipeline.create(task=task, model_path=model_path) + input_schema_requirement = get_input_schema_type(pipeline) - input_data = [] if input_type == "dummy": - if config['input_data_type'] == "string": - data_length = config['sequence_length'] - for _ in range(batch_size): - rand_string = ''.join(random.choices(string.printable, k=data_length)) - input_data.append(rand_string) - inputs = pipeline.input_schema(sequences=input_data) - elif config['input_data_type'] == "array": - image_shape = config["input_array_shape"] - dtype = config["input_array_dtype"] - for _ in range(batch_size): - if dtype == "uint8": - rand_array = numpy.random.randint(0,high=255, size=image_shape).astype(dtype) - rand_array = numpy.random.rand(*image_shape).astype(dtype) - input_data.append(rand_array) + if input_schema_requirement == "image": + input_data = generate_image_data(config, batch_size) inputs = pipeline.input_schema(images=input_data) + elif input_schema_requirement == "text": + input_data = generate_text_data(config, batch_size) + inputs = pipeline.input_schema(sequences=input_data) elif input_type == "real": - raise Exception("Real input type not yet implemented") + if input_schema_requirement == "image": + input_data = load_image_data(config, batch_size) + inputs = pipeline.input_schema(images=input_data) + elif input_schema_requirement == "text": + input_data = load_text_data(config, batch_size) + inputs = pipeline.input_schema(sequences=input_data) else: raise Exception(f"Unknown input type '{input_type}'") @@ -273,9 +347,7 @@ def benchmark_pipeline( raise Exception(f"Unknown scenario '{scenario}'") if len(batch_times) == 0: - raise Exception( - "Generated no batch timings, try extending benchmark time with '--time'" - ) + raise Exception("Generated no batch timings, try extending benchmark time with '--time'") end_time = time.perf_counter() total_run_time = end_time - start_time From b751e750189cde1ea365065d9ad5c1a16a64489a Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Thu, 27 Jul 2023 12:33:05 -0400 Subject: [PATCH 09/37] support for additional pipelines --- .../benchmark/benchmark_pipeline.py | 134 ++++++++++++------ 1 file changed, 90 insertions(+), 44 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_pipeline.py b/src/deepsparse/benchmark/benchmark_pipeline.py index 16904f2d56..ac2ec44607 100644 --- a/src/deepsparse/benchmark/benchmark_pipeline.py +++ b/src/deepsparse/benchmark/benchmark_pipeline.py @@ -215,15 +215,19 @@ def parse_input_config(input_config_file: str) -> Dict[str, any]: def get_input_schema_type(pipeline: Pipeline) -> str: input_schema_requirements = list(pipeline.input_schema.__annotations__.keys()) image_requirements = ["images"] - text_requirements = ["sequences", "text"] - - if len(input_schema_requirements) == 1: - requirement = input_schema_requirements[0] - if requirement in image_requirements: - return "image" - elif requirement in text_requirements: - return "text" - + basic_text_requirements = 
["sequences"] + question_requirements = ["question", "context", "id"] + text_generation_requirements = ["sequences", "return_logits", "session_id", "fixed_sequences_length"] + + if input_schema_requirements == image_requirements or "YOLO" in pipeline.input_schema.__name__: + return "image" + elif input_schema_requirements == basic_text_requirements: + return "text" + elif input_schema_requirements == question_requirements: + return "question" + elif input_schema_requirements == text_generation_requirements: + return "text_generation" + raise Exception("Unknown schema requirement {}".format(input_schema_requirements)) def generate_image_data(config: Dict, batch_size: int) -> List[numpy.ndarray]: @@ -255,14 +259,14 @@ def load_image_data(config: Dict, batch_size: int) -> List[str]: def generate_text_data(config: Dict, batch_size: int) -> List[str]: input_data = [] - if 'sequence_length' in config: - string_length = config['sequence_length'] + if 'gen_sequence_length' in config: + string_length = config['gen_sequence_length'] else: string_length = 100 _LOGGER.warning("Using default string length {}".format(string_length)) for _ in range(batch_size): - rand_string = ''.join(random.choices(string.printable, k=string_length)) - input_data.append(rand_string) + rand_sentence = generate_sentence(string_length) + input_data.append(rand_sentence) return input_data @@ -287,9 +291,38 @@ def load_text_data(config: Dict, batch_size: int) -> List[str]: text_data = f.read() f.close() input_data.append(text_data[:max_string_length]) - print(input_data) return input_data +def generate_sentence(string_length: int, avg_word_length: int = 5): + random_chars = ''.join(random.choices(string.ascii_letters, k=string_length)) + space_locations = random.sample(range(string_length), int(string_length / avg_word_length)) + random_chars = list(random_chars) + for loc in space_locations: + random_chars[loc] = ' ' + return ''.join(random_chars) + +def generate_question_data(config: Dict) -> Tuple[str, str]: + if 'gen_sequence_length' in config: + string_length = config['gen_sequence_length'] + else: + string_length = 100 + _LOGGER.warning("Using default string length {}".format(string_length)) + question = generate_sentence(string_length) + context = generate_sentence(string_length) + return (question, context) + +def load_question_data(config: Dict) -> Tuple[str, str]: + path_to_questions = config["question_file"] + path_to_context = config["context_file"] + + f_question = open(path_to_questions) + f_context = open(path_to_context) + question = f_question.read() + context = f_context.read() + f_question.close() + f_context.close() + return question, context + def benchmark_pipeline( model_path: str, task: str, @@ -325,6 +358,14 @@ def benchmark_pipeline( elif input_schema_requirement == "text": input_data = generate_text_data(config, batch_size) inputs = pipeline.input_schema(sequences=input_data) + elif input_schema_requirement == "question": + _LOGGER.warn("Only batch size of 1 supported for Question Answering Pipeline") + question, context = generate_question_data(config) + inputs = pipeline.input_schema(question=question, context=context) + elif input_schema_requirement == "text_generation": + seqs = generate_text_data(config, batch_size) + fix_len = config["fix_sequence_length"] + inputs = pipeline.input_schema(sequences=seqs, return_logits=False, session_id=None, fixed_sequences_length=fix_len) elif input_type == "real": if input_schema_requirement == "image": input_data = load_image_data(config, batch_size) @@ 
-332,6 +373,14 @@ def benchmark_pipeline( elif input_schema_requirement == "text": input_data = load_text_data(config, batch_size) inputs = pipeline.input_schema(sequences=input_data) + elif input_schema_requirement == "question": + _LOGGER.warn("Only batch size of 1 supported for Question Answering Pipeline") + question, context = load_question_data(config) + inputs = pipeline.input_schema(question=question, context=context) + elif input_schema_requirement == "text_generation": + seqs = load_text_data(config, batch_size) + fix_len = config["fix_sequence_length"] + inputs = pipeline.input_schema(sequences=seqs, return_logits=False, session_id=None, fixed_sequences_length=fix_len) else: raise Exception(f"Unknown input type '{input_type}'") @@ -346,21 +395,22 @@ def benchmark_pipeline( else: raise Exception(f"Unknown scenario '{scenario}'") - if len(batch_times) == 0: - raise Exception("Generated no batch timings, try extending benchmark time with '--time'") end_time = time.perf_counter() total_run_time = end_time - start_time + if len(batch_times) == 0: + raise Exception("Generated no batch timings, try extending benchmark time with '--time'") return batch_times, total_run_time -def calculate_statistics(batch_times_ms: List[float]) -> Dict: +def calculate_statistics(batch_times_ms: List[float], total_run_time_ms: float) -> Dict: percentiles = [25.0, 50.0, 75.0, 90.0, 95.0, 99.0, 99.9] buckets = numpy.percentile(batch_times_ms, percentiles).tolist() percentiles_dict = { "{:2.1f}%".format(key): value for key, value in zip(percentiles, buckets) } - + benchmark_dict = { + "total_percentage": sum(batch_times_ms) / total_run_time_ms * 100, "median": numpy.median(batch_times_ms), "mean": numpy.mean(batch_times_ms), "std": numpy.std(batch_times_ms), @@ -368,6 +418,18 @@ def calculate_statistics(batch_times_ms: List[float]) -> Dict: } return benchmark_dict +def calculate_section_stats(batch_times: List[StagedTimer], total_run_time: float) -> Dict[str, Dict]: + compute_sections = batch_times[0].stages + total_run_time_ms = total_run_time * 1000 + + sections = {} + for section in compute_sections: + section_times = [st.times[section] * 1000 for st in batch_times] + sections[section] = calculate_statistics(section_times, total_run_time_ms) + + return sections + + def main(): args = parse_args() @@ -390,34 +452,16 @@ def main(): quiet=args.quiet, ) - pre_process_times = [st.times['pre_process'] * 1000 for st in batch_times] - pre_stats = calculate_statistics(pre_process_times) - post_process_times = [st.times['post_process'] * 1000 for st in batch_times] - post_stats = calculate_statistics(post_process_times) - engine_forward_times = [st.times['engine_forward'] * 1000 for st in batch_times] - forward_stats = calculate_statistics(engine_forward_times) - + section_stats = calculate_section_stats(batch_times, total_run_time) items_per_sec = (len(batch_times) * args.batch_size) / total_run_time - total_pre_process = sum(pre_process_times) - total_post_process = sum(post_process_times) - total_engine_forward = sum(engine_forward_times) - total_time = total_pre_process + total_post_process + total_engine_forward - percent_pre = total_pre_process / total_time * 100 - percent_post = total_post_process / total_time * 100 - percent_forward = total_engine_forward / total_time * 100 export_dict = { "scenario": args.scenario, "items_per_sec": items_per_sec, "seconds_ran": total_run_time, "iterations": len(batch_times), - "percent_pre": percent_pre, - "percent_post": percent_post, - "percent_forward": 
percent_forward, - "pre_stats": pre_stats, - "post_stats": post_stats, - "forward_stats": forward_stats + "compute_sections": section_stats } # Export results @@ -437,13 +481,15 @@ def main(): export_dict["items_per_sec"] ) ) + print("Processing Time Breakdown: ") - print(" Pre-Processing: {:.2f}%".format(export_dict["percent_pre"])) - print(" Post-Processing: {:.2f}%".format(export_dict["percent_post"])) - print(" Forward Pass: {:.2f}%".format(export_dict["percent_forward"])) - print("Pre-Processing Latency Mean (ms/batch): {:.4f}".format(export_dict["pre_stats"]["mean"])) - print("Post-Processing Latency Mean (ms/batch): {:.4f}".format(export_dict["post_stats"]["mean"])) - print("Forward Pass Latency Mean (ms/batch): {:.4f}".format(export_dict["forward_stats"]["mean"])) + compute_sections = batch_times[0].stages + for section in compute_sections: + print(" {}: {:.2f}%".format(section, section_stats[section]["total_percentage"])) + + print("Mean Latency Breakdown (ms/batch): ") + for section in compute_sections: + print(" {}: {:.4f}".format(section, section_stats[section]["mean"])) if __name__ == "__main__": main() From 76a5af942f1233a8c7c307725c06f50a1ef24e31 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Thu, 27 Jul 2023 15:50:54 -0400 Subject: [PATCH 10/37] expanding input schemas, allowing for kwargs --- .../benchmark/benchmark_pipeline.py | 115 +++++++++--------- 1 file changed, 59 insertions(+), 56 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_pipeline.py b/src/deepsparse/benchmark/benchmark_pipeline.py index ac2ec44607..a62fa09688 100644 --- a/src/deepsparse/benchmark/benchmark_pipeline.py +++ b/src/deepsparse/benchmark/benchmark_pipeline.py @@ -45,6 +45,9 @@ DEEPSPARSE_ENGINE = "deepsparse" ORT_ENGINE = "onnxruntime" +DEFAULT_STRING_LENGTH = 50 +DEFAULT_IMAGE_SHAPE = (240, 240, 3) + def parse_args(): parser = argparse.ArgumentParser( @@ -212,22 +215,40 @@ def parse_input_config(input_config_file: str) -> Dict[str, any]: config_file.close() return config +def get_files_with_endings(folder:str, num_files: int, recursive: bool, file_endings: List[str]) -> List[str]: + files = [] + for f in glob.glob(folder + "/**", recursivere=recursive): + if f.lower().endswith(file_endings): + files.append(f) + if len(files) < num_files: + raise Exception("Not enough images found in {}".format(folder)) + return random.sample(files, num_files) + +def generate_sentence(string_length: int, avg_word_length: int = 5): + random_chars = ''.join(random.choices(string.ascii_letters, k=string_length)) + space_locations = random.sample(range(string_length), int(string_length / avg_word_length)) + random_chars = list(random_chars) + for loc in space_locations: + random_chars[loc] = ' ' + return ''.join(random_chars) + def get_input_schema_type(pipeline: Pipeline) -> str: - input_schema_requirements = list(pipeline.input_schema.__annotations__.keys()) - image_requirements = ["images"] - basic_text_requirements = ["sequences"] - question_requirements = ["question", "context", "id"] - text_generation_requirements = ["sequences", "return_logits", "session_id", "fixed_sequences_length"] + input_schema_requirements = list(pipeline.input_schema.__fields__.keys()) + input_schema_fields = pipeline.input_schema.__fields__ - if input_schema_requirements == image_requirements or "YOLO" in pipeline.input_schema.__name__: + if "images" in input_schema_requirements: return "image" - elif input_schema_requirements == basic_text_requirements: - return "text" - elif input_schema_requirements == 
question_requirements: + if "sequences" in input_schema_requirements: + sequence_types = [f.outer_type_ for f in input_schema_fields['sequences'].sub_fields] + if List[str] in sequence_types: + return "text_sequence" + elif "inputs" in input_schema_requirements: + sequence_types = [f.outer_type_ for f in input_schema_fields['inputs'].sub_fields] + if List[str] in sequence_types: + return "text_inputs" + elif "question" in input_schema_requirements: return "question" - elif input_schema_requirements == text_generation_requirements: - return "text_generation" - + raise Exception("Unknown schema requirement {}".format(input_schema_requirements)) def generate_image_data(config: Dict, batch_size: int) -> List[numpy.ndarray]: @@ -235,7 +256,7 @@ def generate_image_data(config: Dict, batch_size: int) -> List[numpy.ndarray]: if "input_image_shape" in config and len(config["input_image_shape"]) == 3: image_shape = config["input_image_shape"] else: - image_shape = (240, 240, 3) + image_shape = DEFAULT_IMAGE_SHAPE _LOGGER.warning("Using default image shape {}".format(image_shape)) for _ in range(batch_size): @@ -247,22 +268,14 @@ def generate_image_data(config: Dict, batch_size: int) -> List[numpy.ndarray]: def load_image_data(config: Dict, batch_size: int) -> List[str]: path_to_data = config["data_folder"] recursive_search = config["recursive_search"] - files = [] - for f in glob.glob(path_to_data + "/**", recursive=recursive_search): - if f.lower().endswith(".jpeg"): - files.append(f) - if len(files) < batch_size: - raise Exception("Not enough images found in {}".format(path_to_data)) - input_data = random.sample(files, batch_size) - - return input_data + return get_files_with_endings(path_to_data, batch_size, recursive_search, [".jpg", ".jpeg", ".gif"]) def generate_text_data(config: Dict, batch_size: int) -> List[str]: input_data = [] if 'gen_sequence_length' in config: string_length = config['gen_sequence_length'] else: - string_length = 100 + string_length = DEFAULT_STRING_LENGTH _LOGGER.warning("Using default string length {}".format(string_length)) for _ in range(batch_size): rand_sentence = generate_sentence(string_length) @@ -273,13 +286,7 @@ def generate_text_data(config: Dict, batch_size: int) -> List[str]: def load_text_data(config: Dict, batch_size: int) -> List[str]: path_to_data = config["data_folder"] recursive_search = config["recursive_search"] - files = [] - for f in glob.glob(path_to_data + "/**", recursive=recursive_search): - if f.lower().endswith(".txt"): - files.append(f) - if len(files) < batch_size: - raise Exception("Not enough images found in {}".format(path_to_data)) - input_files = random.sample(files, batch_size) + input_files = get_files_with_endings(path_to_data, batch_size, recursive_search, [".txt"]) if "max_string_length" in config: max_string_length = config["max_string_length"] else: @@ -293,19 +300,11 @@ def load_text_data(config: Dict, batch_size: int) -> List[str]: input_data.append(text_data[:max_string_length]) return input_data -def generate_sentence(string_length: int, avg_word_length: int = 5): - random_chars = ''.join(random.choices(string.ascii_letters, k=string_length)) - space_locations = random.sample(range(string_length), int(string_length / avg_word_length)) - random_chars = list(random_chars) - for loc in space_locations: - random_chars[loc] = ' ' - return ''.join(random_chars) - def generate_question_data(config: Dict) -> Tuple[str, str]: if 'gen_sequence_length' in config: string_length = config['gen_sequence_length'] else: - string_length = 
100 + string_length = DEFAULT_STRING_LENGTH _LOGGER.warning("Using default string length {}".format(string_length)) question = generate_sentence(string_length) context = generate_sentence(string_length) @@ -348,39 +347,43 @@ def benchmark_pipeline( config = parse_input_config(input_config) input_type = config["data_type"] - pipeline = Pipeline.create(task=task, model_path=model_path) + kwargs = {} + if "pipeline_kwargs" in config: + kwargs = config["pipeline_kwargs"] + pipeline = Pipeline.create(task=task, model_path=model_path, **kwargs) input_schema_requirement = get_input_schema_type(pipeline) + kwargs = {} + if "input_schema_kwargs" in config: + kwargs = config["input_schema_kwargs"] if input_type == "dummy": if input_schema_requirement == "image": input_data = generate_image_data(config, batch_size) - inputs = pipeline.input_schema(images=input_data) - elif input_schema_requirement == "text": + inputs = pipeline.input_schema(images=input_data, **kwargs) + elif input_schema_requirement == "text_sequence": + input_data = generate_text_data(config, batch_size) + inputs = pipeline.input_schema(sequences=input_data, **kwargs) + elif input_schema_requirement == "text_inputs": input_data = generate_text_data(config, batch_size) - inputs = pipeline.input_schema(sequences=input_data) + inputs = pipeline.input_schema(inputs=input_data, **kwargs) elif input_schema_requirement == "question": _LOGGER.warn("Only batch size of 1 supported for Question Answering Pipeline") question, context = generate_question_data(config) - inputs = pipeline.input_schema(question=question, context=context) - elif input_schema_requirement == "text_generation": - seqs = generate_text_data(config, batch_size) - fix_len = config["fix_sequence_length"] - inputs = pipeline.input_schema(sequences=seqs, return_logits=False, session_id=None, fixed_sequences_length=fix_len) + inputs = pipeline.input_schema(question=question, context=context, **kwargs) elif input_type == "real": if input_schema_requirement == "image": input_data = load_image_data(config, batch_size) - inputs = pipeline.input_schema(images=input_data) - elif input_schema_requirement == "text": + inputs = pipeline.input_schema(images=input_data, **kwargs) + elif input_schema_requirement == "text_sequence": + input_data = load_text_data(config, batch_size) + inputs = pipeline.input_schema(sequences=input_data, **kwargs) + elif input_schema_requirement == "text_inputs": input_data = load_text_data(config, batch_size) - inputs = pipeline.input_schema(sequences=input_data) + inputs = pipeline.input_schema(inputs=input_data, **kwargs) elif input_schema_requirement == "question": _LOGGER.warn("Only batch size of 1 supported for Question Answering Pipeline") question, context = load_question_data(config) - inputs = pipeline.input_schema(question=question, context=context) - elif input_schema_requirement == "text_generation": - seqs = load_text_data(config, batch_size) - fix_len = config["fix_sequence_length"] - inputs = pipeline.input_schema(sequences=seqs, return_logits=False, session_id=None, fixed_sequences_length=fix_len) + inputs = pipeline.input_schema(question=question, context=context, **kwargs) else: raise Exception(f"Unknown input type '{input_type}'") From 6cb6bef4a4e31c60e4258fb5e4725d155f3b16ca Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Fri, 28 Jul 2023 11:00:30 -0400 Subject: [PATCH 11/37] README, quality, additional args --- src/deepsparse/benchmark/README.md | 68 ++++++ src/deepsparse/benchmark/benchmark_model.py | 13 +- 
 .../benchmark/benchmark_pipeline.py | 206 +++++-------
 src/deepsparse/benchmark/helpers.py | 25 ++-
 4 files changed, 229 insertions(+), 83 deletions(-)

diff --git a/src/deepsparse/benchmark/README.md b/src/deepsparse/benchmark/README.md
index c67133744e..6bcb3477ec 100644
--- a/src/deepsparse/benchmark/README.md
+++ b/src/deepsparse/benchmark/README.md
@@ -186,4 +186,72 @@ Latency Mean (ms/batch): 16.0732
 Latency Median (ms/batch): 15.7850
 Latency Std (ms/batch): 1.0427
 Iterations: 622
+```
+
+## 📜 Benchmarking Pipelines
+Expanding on the model benchmarking script, the pipeline benchmarker is a tool for benchmarking end-to-end inference, including pre- and post-processing. The script can generate fake input data based on the pipeline's input schema, or load it from a local folder. The pipeline then runs pre-processing, engine inference, and post-processing. Benchmarking results are reported per section, which is useful for identifying bottlenecks.
+
+### Usage
+Input arguments are the same as for the Engine benchmarker, with two additions:
+
+```
+positional arguments:
+  task_name             Type of pipeline to run (e.g. "text_generation")
+
+optional arguments:
+  -c INPUT_CONFIG, --input_config INPUT_CONFIG
+                        JSON file containing schema for input data
+```
+
+The `input_config` argument is a path to a JSON file specifying details of the input schema to the pipeline, detailed below.
+
+### Configuring Pipeline Inputs
+
+Inputs to the pipeline are configured through a JSON config file. The `data_type` field should be set to `"dummy"` if passing randomly generated data through the pipeline, and `"real"` if passing in data from files.
+
+#### Dummy Input Configuration
+An example dummy input configuration is shown below.
+* `gen_sequence_length`: number of characters to generate for pipelines that take text input
+* `input_image_shape`: configures image size for pipelines that take image input; must be 3-dimensional with the channel as the last dimension
+
+```json
+{
+    "data_type": "dummy",
+    "gen_sequence_length": 100,
+    "input_image_shape": [500,500,3],
+    "pipeline_kwargs": {},
+    "input_schema_kwargs": {}
+}
+```
+
+#### Real Input Configuration
+An example real input configuration is shown below.
+* `data_folder`: path to a local folder of input data; should contain text or image files
+* `recursive_search`: whether to recursively search through `data_folder` for files
+* `max_string_length`: maximum characters to read from each file containing text data, -1 for no max length
+
+```json
+{
+    "data_type": "real",
+    "data_folder": "/home/sadkins/imagenette2-320/",
+    "recursive_search": true,
+    "max_string_length": -1,
+    "pipeline_kwargs": {},
+    "input_schema_kwargs": {}
+}
+```
+
+#### Keyword Arguments
+Additional arguments to the pipeline or input_schema can be added to the `pipeline_kwargs` and `input_schema_kwargs` fields respectively.
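+Both fields are forwarded unchanged as keyword arguments: `pipeline_kwargs` to `Pipeline.create` and `input_schema_kwargs` to the pipeline's input schema constructor. A minimal sketch of that behavior is shown below (a simplified illustration, not the benchmarker itself; the task, SparseZoo stub, and dummy image batch are example values, and engine/scheduler arguments are omitted):
+
+```python
+import json
+
+import numpy
+from deepsparse import Pipeline
+
+config = json.load(open("config.json"))
+
+# a dummy batch of one image, similar to what the benchmarker generates for image pipelines
+input_data = [numpy.random.randint(0, 255, size=(240, 240, 3)).astype(numpy.uint8)]
+
+# extra kwargs forwarded to the Pipeline constructor
+pipeline = Pipeline.create(
+    task="image_classification",
+    model_path="zoo:cv/classification/resnet_v1-50_2x/pytorch/sparseml/imagenet/base-none",
+    **config["pipeline_kwargs"],
+)
+
+# extra kwargs forwarded to the input schema object that is passed to the pipeline
+inputs = pipeline.input_schema(images=input_data, **config["input_schema_kwargs"])
+output = pipeline(inputs)
+```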
For instance, to pass class_names to a YOLO pipeline and conf_thres to the input schema +```json +{ + "data_type": "dummy", + "input_image_shape": [500,500,3], + "pipeline_kwargs": { + "class_names": ["classA", "classB"] + }, + "input_schema_kwargs": { + "conf_thres": 0.7 + } +} ``` \ No newline at end of file diff --git a/src/deepsparse/benchmark/benchmark_model.py b/src/deepsparse/benchmark/benchmark_model.py index ea280cc809..04fcdb8c7a 100644 --- a/src/deepsparse/benchmark/benchmark_model.py +++ b/src/deepsparse/benchmark/benchmark_model.py @@ -95,10 +95,15 @@ import importlib import json import logging -import os from typing import Dict from deepsparse import __version__, compile_model +from deepsparse.benchmark.helpers import ( + decide_thread_pinning, + parse_num_streams, + parse_scenario, + parse_scheduler, +) from deepsparse.benchmark.ort_engine import ORTEngine from deepsparse.benchmark.stream_benchmark import model_stream_benchmark from deepsparse.cpu import cpu_architecture @@ -109,12 +114,6 @@ override_onnx_input_shapes, parse_input_shapes, ) -from deepsparse.benchmark.helpers import ( - decide_thread_pinning, - parse_scheduler, - parse_scenario, - parse_num_streams -) __all__ = ["benchmark_model"] diff --git a/src/deepsparse/benchmark/benchmark_pipeline.py b/src/deepsparse/benchmark/benchmark_pipeline.py index a62fa09688..a791b11853 100644 --- a/src/deepsparse/benchmark/benchmark_pipeline.py +++ b/src/deepsparse/benchmark/benchmark_pipeline.py @@ -13,28 +13,28 @@ # limitations under the License. import argparse +import glob import json -import string import logging +import queue import random -from typing import Dict, List, Tuple +import string +import threading import time +from typing import Dict, List, Tuple + import numpy -import threading -import queue -import glob -from deepsparse import __version__ from deepsparse import Pipeline -from deepsparse.cpu import cpu_architecture -from deepsparse.log import set_logging_level -from deepsparse.utils.timer import StagedTimer from deepsparse.benchmark.helpers import ( decide_thread_pinning, - parse_scheduler, + parse_num_streams, parse_scenario, - parse_num_streams + parse_scheduler, ) +from deepsparse.cpu import cpu_architecture +from deepsparse.log import set_logging_level +from deepsparse.utils.timer import StagedTimer __all__ = ["benchmark_pipeline"] @@ -50,14 +50,8 @@ def parse_args(): - parser = argparse.ArgumentParser( - description="Benchmark DeepSparse Pipelines" - ) - parser.add_argument( - "task_name", - type=str, - help="Type of pipeline to run" - ) + parser = argparse.ArgumentParser(description="Benchmark DeepSparse Pipelines") + parser.add_argument("task_name", type=str, help="Type of pipeline to run") parser.add_argument( "model_path", type=str, @@ -68,15 +62,7 @@ def parse_args(): "--input_config", type=str, default="config.json", - help="JSON file containing schema for input data" - ) - parser.add_argument( - "-i", - "--input_type", - type=str, - default="dummy", - choices=["dummy", "real"], - help="Type of input data to use, real or randomly generated" + help="JSON file containing schema for input data", ) parser.add_argument( "-b", @@ -115,6 +101,16 @@ def parse_args(): default=10, help="The number of seconds the benchmark will run. Default is 10 seconds.", ) + parser.add_argument( + "-w", + "--warmup_time", + type=int, + default=2, + help=( + "The number of seconds the benchmark will warmup before running." + "Default is 2 seconds." 
+ ), + ) parser.add_argument( "-nstreams", "--num_streams", @@ -137,6 +133,19 @@ def parse_args(): "threads to cores on sockets ('numa'), or disable ('none')" ), ) + parser.add_argument( + "-e", + "--engine", + type=str, + default=DEEPSPARSE_ENGINE, + help=( + "Inference engine backend to run eval on. Choices are 'deepsparse', " + "'onnxruntime'. Default is 'deepsparse'. Can also specify a user " + "defined engine class by giving the script and class name in the " + "following format :. This " + "engine class will be dynamically imported during runtime" + ), + ) parser.add_argument( "-q", "--quiet", @@ -154,13 +163,14 @@ def parse_args(): return parser.parse_args() + class PipelineExecutorThread(threading.Thread): def __init__( self, pipeline: Pipeline, inputs: List[any], time_queue: queue.Queue, - max_time: float + max_time: float, ): super(PipelineExecutorThread, self).__init__() self._pipeline = pipeline @@ -170,23 +180,22 @@ def __init__( def run(self): while time.perf_counter() < self._max_time: - output = self._pipeline(self._inputs) + _ = self._pipeline(self._inputs) self._time_queue.put(self._pipeline.timer_manager.latest) def singlestream_benchmark( - pipeline: Pipeline, - inputs: List[any], - seconds_to_run: float + pipeline: Pipeline, inputs: List[any], seconds_to_run: float ) -> List[StagedTimer]: benchmark_end_time = time.perf_counter() + seconds_to_run batch_timings = [] while time.perf_counter() < benchmark_end_time: - output = pipeline(inputs) + _ = pipeline(inputs) batch_timings.append(pipeline.timer_manager.latest) return batch_timings + def multistream_benchmark( pipeline: Pipeline, inputs: List[any], @@ -215,7 +224,10 @@ def parse_input_config(input_config_file: str) -> Dict[str, any]: config_file.close() return config -def get_files_with_endings(folder:str, num_files: int, recursive: bool, file_endings: List[str]) -> List[str]: + +def get_files_with_endings( + folder: str, num_files: int, recursive: bool, file_endings: List[str] +) -> List[str]: files = [] for f in glob.glob(folder + "/**", recursivere=recursive): if f.lower().endswith(file_endings): @@ -224,13 +236,17 @@ def get_files_with_endings(folder:str, num_files: int, recursive: bool, file_end raise Exception("Not enough images found in {}".format(folder)) return random.sample(files, num_files) + def generate_sentence(string_length: int, avg_word_length: int = 5): - random_chars = ''.join(random.choices(string.ascii_letters, k=string_length)) - space_locations = random.sample(range(string_length), int(string_length / avg_word_length)) + random_chars = "".join(random.choices(string.ascii_letters, k=string_length)) + space_locations = random.sample( + range(string_length), int(string_length / avg_word_length) + ) random_chars = list(random_chars) for loc in space_locations: - random_chars[loc] = ' ' - return ''.join(random_chars) + random_chars[loc] = " " + return "".join(random_chars) + def get_input_schema_type(pipeline: Pipeline) -> str: input_schema_requirements = list(pipeline.input_schema.__fields__.keys()) @@ -239,18 +255,23 @@ def get_input_schema_type(pipeline: Pipeline) -> str: if "images" in input_schema_requirements: return "image" if "sequences" in input_schema_requirements: - sequence_types = [f.outer_type_ for f in input_schema_fields['sequences'].sub_fields] + sequence_types = [ + f.outer_type_ for f in input_schema_fields["sequences"].sub_fields + ] if List[str] in sequence_types: return "text_sequence" elif "inputs" in input_schema_requirements: - sequence_types = [f.outer_type_ for f in 
input_schema_fields['inputs'].sub_fields] + sequence_types = [ + f.outer_type_ for f in input_schema_fields["inputs"].sub_fields + ] if List[str] in sequence_types: return "text_inputs" elif "question" in input_schema_requirements: return "question" - + raise Exception("Unknown schema requirement {}".format(input_schema_requirements)) + def generate_image_data(config: Dict, batch_size: int) -> List[numpy.ndarray]: input_data = [] if "input_image_shape" in config and len(config["input_image_shape"]) == 3: @@ -260,33 +281,42 @@ def generate_image_data(config: Dict, batch_size: int) -> List[numpy.ndarray]: _LOGGER.warning("Using default image shape {}".format(image_shape)) for _ in range(batch_size): - rand_array = numpy.random.randint(0,high=255, size=image_shape).astype(numpy.uint8) + rand_array = numpy.random.randint(0, high=255, size=image_shape).astype( + numpy.uint8 + ) input_data.append(rand_array) return input_data + def load_image_data(config: Dict, batch_size: int) -> List[str]: path_to_data = config["data_folder"] recursive_search = config["recursive_search"] - return get_files_with_endings(path_to_data, batch_size, recursive_search, [".jpg", ".jpeg", ".gif"]) + return get_files_with_endings( + path_to_data, batch_size, recursive_search, [".jpg", ".jpeg", ".gif"] + ) + def generate_text_data(config: Dict, batch_size: int) -> List[str]: input_data = [] - if 'gen_sequence_length' in config: - string_length = config['gen_sequence_length'] + if "gen_sequence_length" in config: + string_length = config["gen_sequence_length"] else: string_length = DEFAULT_STRING_LENGTH _LOGGER.warning("Using default string length {}".format(string_length)) for _ in range(batch_size): rand_sentence = generate_sentence(string_length) input_data.append(rand_sentence) - + return input_data + def load_text_data(config: Dict, batch_size: int) -> List[str]: path_to_data = config["data_folder"] recursive_search = config["recursive_search"] - input_files = get_files_with_endings(path_to_data, batch_size, recursive_search, [".txt"]) + input_files = get_files_with_endings( + path_to_data, batch_size, recursive_search, [".txt"] + ) if "max_string_length" in config: max_string_length = config["max_string_length"] else: @@ -300,9 +330,10 @@ def load_text_data(config: Dict, batch_size: int) -> List[str]: input_data.append(text_data[:max_string_length]) return input_data + def generate_question_data(config: Dict) -> Tuple[str, str]: - if 'gen_sequence_length' in config: - string_length = config['gen_sequence_length'] + if "gen_sequence_length" in config: + string_length = config["gen_sequence_length"] else: string_length = DEFAULT_STRING_LENGTH _LOGGER.warning("Using default string length {}".format(string_length)) @@ -310,6 +341,7 @@ def generate_question_data(config: Dict) -> Tuple[str, str]: context = generate_sentence(string_length) return (question, context) + def load_question_data(config: Dict) -> Tuple[str, str]: path_to_questions = config["question_file"] path_to_context = config["context_file"] @@ -322,6 +354,7 @@ def load_question_data(config: Dict) -> Tuple[str, str]: f_context.close() return question, context + def benchmark_pipeline( model_path: str, task: str, @@ -330,11 +363,13 @@ def benchmark_pipeline( num_cores: int = None, scenario: str = "sync", seconds_to_run: int = 10, + warmup_time: int = 2, num_streams: int = None, thread_pinning: str = "core", + engine: str = DEEPSPARSE_ENGINE, quiet: bool = False, -) -> Tuple[List[StagedTimer],float] : - +) -> Tuple[List[StagedTimer], float]: + if quiet: 
set_logging_level(logging.WARN) @@ -343,14 +378,22 @@ def benchmark_pipeline( decide_thread_pinning(thread_pinning, _LOGGER) scenario = parse_scenario(scenario.lower(), _LOGGER) + scheduler = parse_scheduler(scenario) num_streams = parse_num_streams(num_streams, num_cores, scenario, _LOGGER) - + config = parse_input_config(input_config) input_type = config["data_type"] kwargs = {} if "pipeline_kwargs" in config: kwargs = config["pipeline_kwargs"] - pipeline = Pipeline.create(task=task, model_path=model_path, **kwargs) + pipeline = Pipeline.create( + task=task, + model_path=model_path, + engine_type=engine, + scheduler=scheduler, + num_cores=num_cores, + **kwargs, + ) input_schema_requirement = get_input_schema_type(pipeline) kwargs = {} if "input_schema_kwargs" in config: @@ -367,7 +410,9 @@ def benchmark_pipeline( input_data = generate_text_data(config, batch_size) inputs = pipeline.input_schema(inputs=input_data, **kwargs) elif input_schema_requirement == "question": - _LOGGER.warn("Only batch size of 1 supported for Question Answering Pipeline") + _LOGGER.warn( + "Only batch size of 1 supported for Question Answering Pipeline" + ) question, context = generate_question_data(config) inputs = pipeline.input_schema(question=question, context=context, **kwargs) elif input_type == "real": @@ -381,37 +426,48 @@ def benchmark_pipeline( input_data = load_text_data(config, batch_size) inputs = pipeline.input_schema(inputs=input_data, **kwargs) elif input_schema_requirement == "question": - _LOGGER.warn("Only batch size of 1 supported for Question Answering Pipeline") + _LOGGER.warn( + "Only batch size of 1 supported for Question Answering Pipeline" + ) question, context = load_question_data(config) inputs = pipeline.input_schema(question=question, context=context, **kwargs) else: raise Exception(f"Unknown input type '{input_type}'") - start_time = time.perf_counter() if scenario == "singlestream": + singlestream_benchmark(pipeline, inputs, warmup_time) batch_times = singlestream_benchmark(pipeline, inputs, seconds_to_run) elif scenario == "multistream": - batch_times = multistream_benchmark(pipeline, inputs, seconds_to_run, num_streams) + multistream_benchmark(pipeline, inputs, warmup_time, num_streams) + batch_times = multistream_benchmark( + pipeline, inputs, seconds_to_run, num_streams + ) elif scenario == "elastic": - batch_times = multistream_benchmark(pipeline, inputs, seconds_to_run, num_streams) + multistream_benchmark(pipeline, inputs, warmup_time, num_streams) + batch_times = multistream_benchmark( + pipeline, inputs, seconds_to_run, num_streams + ) else: raise Exception(f"Unknown scenario '{scenario}'") end_time = time.perf_counter() total_run_time = end_time - start_time if len(batch_times) == 0: - raise Exception("Generated no batch timings, try extending benchmark time with '--time'") + raise Exception( + "Generated no batch timings, try extending benchmark time with '--time'" + ) return batch_times, total_run_time + def calculate_statistics(batch_times_ms: List[float], total_run_time_ms: float) -> Dict: percentiles = [25.0, 50.0, 75.0, 90.0, 95.0, 99.0, 99.9] buckets = numpy.percentile(batch_times_ms, percentiles).tolist() percentiles_dict = { "{:2.1f}%".format(key): value for key, value in zip(percentiles, buckets) } - + benchmark_dict = { "total_percentage": sum(batch_times_ms) / total_run_time_ms * 100, "median": numpy.median(batch_times_ms), @@ -421,7 +477,10 @@ def calculate_statistics(batch_times_ms: List[float], total_run_time_ms: float) } return benchmark_dict -def 
calculate_section_stats(batch_times: List[StagedTimer], total_run_time: float) -> Dict[str, Dict]: + +def calculate_section_stats( + batch_times: List[StagedTimer], total_run_time: float +) -> Dict[str, Dict]: compute_sections = batch_times[0].stages total_run_time_ms = total_run_time * 1000 @@ -438,33 +497,33 @@ def main(): print("Original Model Path: {}".format(args.model_path)) print("Task: {}".format(args.task_name)) - print("Input Type: {}".format(args.input_type)) print("Batch Size: {}".format(args.batch_size)) print("Scenario: {}".format(args.scenario)) batch_times, total_run_time = benchmark_pipeline( model_path=args.model_path, task=args.task_name, - input_config = args.input_config, + input_config=args.input_config, batch_size=args.batch_size, num_cores=args.num_cores, scenario=args.scenario, seconds_to_run=args.time, + warmup_time=args.warmup_time, num_streams=args.num_streams, thread_pinning=args.thread_pinning, + engine=args.engine, quiet=args.quiet, ) section_stats = calculate_section_stats(batch_times, total_run_time) items_per_sec = (len(batch_times) * args.batch_size) / total_run_time - export_dict = { "scenario": args.scenario, "items_per_sec": items_per_sec, "seconds_ran": total_run_time, "iterations": len(batch_times), - "compute_sections": section_stats + "compute_sections": section_stats, } # Export results @@ -479,20 +538,21 @@ def main(): print("Batch Size: {}".format(args.batch_size)) print("Scenario: {}".format(args.scenario)) print("Iterations: {}".format(int(export_dict["iterations"]))) - print( - "Throughput (items/sec): {:.4f}".format( - export_dict["items_per_sec"] - ) - ) + print("Throughput (items/sec): {:.4f}".format(export_dict["items_per_sec"])) print("Processing Time Breakdown: ") compute_sections = batch_times[0].stages for section in compute_sections: - print(" {}: {:.2f}%".format(section, section_stats[section]["total_percentage"])) - + print( + " {}: {:.2f}%".format( + section, section_stats[section]["total_percentage"] + ) + ) + print("Mean Latency Breakdown (ms/batch): ") for section in compute_sections: print(" {}: {:.4f}".format(section, section_stats[section]["mean"])) + if __name__ == "__main__": main() diff --git a/src/deepsparse/benchmark/helpers.py b/src/deepsparse/benchmark/helpers.py index 14c90653a6..d0ccb95295 100644 --- a/src/deepsparse/benchmark/helpers.py +++ b/src/deepsparse/benchmark/helpers.py @@ -1,14 +1,30 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
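+
+"""
+Shared helpers for the benchmark scripts: thread pinning, scheduler selection,
+and scenario / stream-count parsing.
+"""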
+ import os from deepsparse import Scheduler + __all__ = [ "decide_thread_pinning", "parse_scheduler", "parse_scenario", - "parse_num_streams" + "parse_num_streams", ] + def decide_thread_pinning(pinning_mode: str, logger: object) -> None: pinning_mode = pinning_mode.lower() if pinning_mode in "core": @@ -29,6 +45,7 @@ def decide_thread_pinning(pinning_mode: str, logger: object) -> None: ) ) + def parse_scheduler(scenario: str) -> Scheduler: scenario = scenario.lower() if scenario == "multistream": @@ -39,7 +56,8 @@ def parse_scheduler(scenario: str) -> Scheduler: return Scheduler.elastic else: return Scheduler.multi_stream - + + def parse_scenario(scenario: str, logger: object) -> str: scenario = scenario.lower() if scenario == "async": @@ -55,7 +73,8 @@ def parse_scenario(scenario: str, logger: object) -> str: ) ) return "multistream" - + + def parse_num_streams(num_streams: int, num_cores: int, scenario: str, logger: object): # If model.num_streams is set, and the scenario is either "multi_stream" or # "elastic", use the value of num_streams given to us by the model, otherwise From 75f5173a46d7196785f6be76e7cacab3cd0289a1 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Fri, 28 Jul 2023 11:40:23 -0400 Subject: [PATCH 12/37] moving code around, update README --- src/deepsparse/benchmark/README.md | 12 + .../benchmark/benchmark_pipeline.py | 262 +++++------------- src/deepsparse/benchmark/data_creation.py | 170 ++++++++++++ src/deepsparse/benchmark/helpers.py | 13 + 4 files changed, 271 insertions(+), 186 deletions(-) create mode 100644 src/deepsparse/benchmark/data_creation.py diff --git a/src/deepsparse/benchmark/README.md b/src/deepsparse/benchmark/README.md index 6bcb3477ec..f530c0255b 100644 --- a/src/deepsparse/benchmark/README.md +++ b/src/deepsparse/benchmark/README.md @@ -254,4 +254,16 @@ Additional arguments to the pipeline or input_schema can be added to the `pipeli "conf_thres": 0.7 } } +``` + +### Example Usage + +Running image classification for 30 seconds with a batch size of 32: +``` +python benchmark_pipeline.py image_classification zoo:cv/classification/resnet_v1-50_2x/pytorch/sparseml/imagenet/base-none -c config.json -t 60 -b 32 +``` + +Running text generation for 30 seconds asynchronously +``` +python benchmark_pipeline.py text_generation image_classification zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/pruned50-none -c config.json -t 30 -s async ``` \ No newline at end of file diff --git a/src/deepsparse/benchmark/benchmark_pipeline.py b/src/deepsparse/benchmark/benchmark_pipeline.py index a791b11853..1ff586253b 100644 --- a/src/deepsparse/benchmark/benchmark_pipeline.py +++ b/src/deepsparse/benchmark/benchmark_pipeline.py @@ -13,21 +13,28 @@ # limitations under the License. 
import argparse -import glob import json import logging import queue -import random -import string import threading import time from typing import Dict, List, Tuple import numpy -from deepsparse import Pipeline +from deepsparse import Pipeline, __version__ +from deepsparse.benchmark.data_creation import ( + generate_image_data, + generate_question_data, + generate_text_data, + get_input_schema_type, + load_image_data, + load_question_data, + load_text_data, +) from deepsparse.benchmark.helpers import ( decide_thread_pinning, + parse_input_config, parse_num_streams, parse_scenario, parse_scheduler, @@ -45,9 +52,6 @@ DEEPSPARSE_ENGINE = "deepsparse" ORT_ENGINE = "onnxruntime" -DEFAULT_STRING_LENGTH = 50 -DEFAULT_IMAGE_SHAPE = (240, 240, 3) - def parse_args(): parser = argparse.ArgumentParser(description="Benchmark DeepSparse Pipelines") @@ -218,147 +222,56 @@ def multistream_benchmark( return list(time_queue.queue) -def parse_input_config(input_config_file: str) -> Dict[str, any]: - config_file = open(input_config_file) - config = json.load(config_file) - config_file.close() - return config - - -def get_files_with_endings( - folder: str, num_files: int, recursive: bool, file_endings: List[str] -) -> List[str]: - files = [] - for f in glob.glob(folder + "/**", recursivere=recursive): - if f.lower().endswith(file_endings): - files.append(f) - if len(files) < num_files: - raise Exception("Not enough images found in {}".format(folder)) - return random.sample(files, num_files) - - -def generate_sentence(string_length: int, avg_word_length: int = 5): - random_chars = "".join(random.choices(string.ascii_letters, k=string_length)) - space_locations = random.sample( - range(string_length), int(string_length / avg_word_length) - ) - random_chars = list(random_chars) - for loc in space_locations: - random_chars[loc] = " " - return "".join(random_chars) - - -def get_input_schema_type(pipeline: Pipeline) -> str: - input_schema_requirements = list(pipeline.input_schema.__fields__.keys()) - input_schema_fields = pipeline.input_schema.__fields__ - - if "images" in input_schema_requirements: - return "image" - if "sequences" in input_schema_requirements: - sequence_types = [ - f.outer_type_ for f in input_schema_fields["sequences"].sub_fields - ] - if List[str] in sequence_types: - return "text_sequence" - elif "inputs" in input_schema_requirements: - sequence_types = [ - f.outer_type_ for f in input_schema_fields["inputs"].sub_fields - ] - if List[str] in sequence_types: - return "text_inputs" - elif "question" in input_schema_requirements: - return "question" - - raise Exception("Unknown schema requirement {}".format(input_schema_requirements)) - - -def generate_image_data(config: Dict, batch_size: int) -> List[numpy.ndarray]: - input_data = [] - if "input_image_shape" in config and len(config["input_image_shape"]) == 3: - image_shape = config["input_image_shape"] - else: - image_shape = DEFAULT_IMAGE_SHAPE - _LOGGER.warning("Using default image shape {}".format(image_shape)) - - for _ in range(batch_size): - rand_array = numpy.random.randint(0, high=255, size=image_shape).astype( - numpy.uint8 - ) - input_data.append(rand_array) - - return input_data - - -def load_image_data(config: Dict, batch_size: int) -> List[str]: - path_to_data = config["data_folder"] - recursive_search = config["recursive_search"] - return get_files_with_endings( - path_to_data, batch_size, recursive_search, [".jpg", ".jpeg", ".gif"] - ) - - -def generate_text_data(config: Dict, batch_size: int) -> List[str]: - input_data = [] - if 
"gen_sequence_length" in config: - string_length = config["gen_sequence_length"] - else: - string_length = DEFAULT_STRING_LENGTH - _LOGGER.warning("Using default string length {}".format(string_length)) - for _ in range(batch_size): - rand_sentence = generate_sentence(string_length) - input_data.append(rand_sentence) - - return input_data - +def create_input_schema( + pipeline: Pipeline, input_type: str, batch_size: int, config: Dict +) -> any: + input_schema_requirement = get_input_schema_type(pipeline) + kwargs = {} + if "input_schema_kwargs" in config: + kwargs = config["input_schema_kwargs"] -def load_text_data(config: Dict, batch_size: int) -> List[str]: - path_to_data = config["data_folder"] - recursive_search = config["recursive_search"] - input_files = get_files_with_endings( - path_to_data, batch_size, recursive_search, [".txt"] - ) - if "max_string_length" in config: - max_string_length = config["max_string_length"] - else: - max_string_length = -1 - _LOGGER.warning("Using default max string length {}".format(max_string_length)) - input_data = [] - for f_path in input_files: - f = open(f_path) - text_data = f.read() - f.close() - input_data.append(text_data[:max_string_length]) - return input_data - - -def generate_question_data(config: Dict) -> Tuple[str, str]: - if "gen_sequence_length" in config: - string_length = config["gen_sequence_length"] + if input_type == "dummy": + if input_schema_requirement == "image": + input_data = generate_image_data(config, batch_size, _LOGGER) + inputs = pipeline.input_schema(images=input_data, **kwargs) + elif input_schema_requirement == "text_sequence": + input_data = generate_text_data(config, batch_size, _LOGGER) + inputs = pipeline.input_schema(sequences=input_data, **kwargs) + elif input_schema_requirement == "text_inputs": + input_data = generate_text_data(config, batch_size, _LOGGER) + inputs = pipeline.input_schema(inputs=input_data, **kwargs) + elif input_schema_requirement == "question": + _LOGGER.warn( + "Only batch size of 1 supported for Question Answering Pipeline" + ) + question, context = generate_question_data(config, _LOGGER) + inputs = pipeline.input_schema(question=question, context=context, **kwargs) + elif input_type == "real": + if input_schema_requirement == "image": + input_data = load_image_data(config, batch_size) + inputs = pipeline.input_schema(images=input_data, **kwargs) + elif input_schema_requirement == "text_sequence": + input_data = load_text_data(config, _LOGGER) + inputs = pipeline.input_schema(sequences=input_data, **kwargs) + elif input_schema_requirement == "text_inputs": + input_data = load_text_data(config, batch_size, _LOGGER) + inputs = pipeline.input_schema(inputs=input_data, **kwargs) + elif input_schema_requirement == "question": + _LOGGER.warn( + "Only batch size of 1 supported for Question Answering Pipeline" + ) + question, context = load_question_data(config) + inputs = pipeline.input_schema(question=question, context=context, **kwargs) else: - string_length = DEFAULT_STRING_LENGTH - _LOGGER.warning("Using default string length {}".format(string_length)) - question = generate_sentence(string_length) - context = generate_sentence(string_length) - return (question, context) - - -def load_question_data(config: Dict) -> Tuple[str, str]: - path_to_questions = config["question_file"] - path_to_context = config["context_file"] + raise Exception(f"Unknown input type '{input_type}'") - f_question = open(path_to_questions) - f_context = open(path_to_context) - question = f_question.read() - context = 
f_context.read() - f_question.close() - f_context.close() - return question, context + return inputs def benchmark_pipeline( model_path: str, task: str, - input_config: str, + config: Dict, batch_size: int = 1, num_cores: int = None, scenario: str = "sync", @@ -381,7 +294,6 @@ def benchmark_pipeline( scheduler = parse_scheduler(scenario) num_streams = parse_num_streams(num_streams, num_cores, scenario, _LOGGER) - config = parse_input_config(input_config) input_type = config["data_type"] kwargs = {} if "pipeline_kwargs" in config: @@ -394,57 +306,21 @@ def benchmark_pipeline( num_cores=num_cores, **kwargs, ) - input_schema_requirement = get_input_schema_type(pipeline) - kwargs = {} - if "input_schema_kwargs" in config: - kwargs = config["input_schema_kwargs"] - - if input_type == "dummy": - if input_schema_requirement == "image": - input_data = generate_image_data(config, batch_size) - inputs = pipeline.input_schema(images=input_data, **kwargs) - elif input_schema_requirement == "text_sequence": - input_data = generate_text_data(config, batch_size) - inputs = pipeline.input_schema(sequences=input_data, **kwargs) - elif input_schema_requirement == "text_inputs": - input_data = generate_text_data(config, batch_size) - inputs = pipeline.input_schema(inputs=input_data, **kwargs) - elif input_schema_requirement == "question": - _LOGGER.warn( - "Only batch size of 1 supported for Question Answering Pipeline" - ) - question, context = generate_question_data(config) - inputs = pipeline.input_schema(question=question, context=context, **kwargs) - elif input_type == "real": - if input_schema_requirement == "image": - input_data = load_image_data(config, batch_size) - inputs = pipeline.input_schema(images=input_data, **kwargs) - elif input_schema_requirement == "text_sequence": - input_data = load_text_data(config, batch_size) - inputs = pipeline.input_schema(sequences=input_data, **kwargs) - elif input_schema_requirement == "text_inputs": - input_data = load_text_data(config, batch_size) - inputs = pipeline.input_schema(inputs=input_data, **kwargs) - elif input_schema_requirement == "question": - _LOGGER.warn( - "Only batch size of 1 supported for Question Answering Pipeline" - ) - question, context = load_question_data(config) - inputs = pipeline.input_schema(question=question, context=context, **kwargs) - else: - raise Exception(f"Unknown input type '{input_type}'") + inputs = create_input_schema(pipeline, input_type, batch_size, config) - start_time = time.perf_counter() if scenario == "singlestream": singlestream_benchmark(pipeline, inputs, warmup_time) + start_time = time.perf_counter() batch_times = singlestream_benchmark(pipeline, inputs, seconds_to_run) elif scenario == "multistream": multistream_benchmark(pipeline, inputs, warmup_time, num_streams) + start_time = time.perf_counter() batch_times = multistream_benchmark( pipeline, inputs, seconds_to_run, num_streams ) elif scenario == "elastic": multistream_benchmark(pipeline, inputs, warmup_time, num_streams) + start_time = time.perf_counter() batch_times = multistream_benchmark( pipeline, inputs, seconds_to_run, num_streams ) @@ -494,6 +370,7 @@ def calculate_section_stats( def main(): args = parse_args() + config = parse_input_config(args.input_config) print("Original Model Path: {}".format(args.model_path)) print("Task: {}".format(args.task_name)) @@ -503,7 +380,7 @@ def main(): batch_times, total_run_time = benchmark_pipeline( model_path=args.model_path, task=args.task_name, - input_config=args.input_config, + config=config, 
batch_size=args.batch_size, num_cores=args.num_cores, scenario=args.scenario, @@ -518,14 +395,26 @@ def main(): section_stats = calculate_section_stats(batch_times, total_run_time) items_per_sec = (len(batch_times) * args.batch_size) / total_run_time - export_dict = { - "scenario": args.scenario, + benchmark_results = { "items_per_sec": items_per_sec, "seconds_ran": total_run_time, "iterations": len(batch_times), "compute_sections": section_stats, } + export_dict = { + "engine": args.engine, + "version": __version__, + "model_path": args.model_path, + "batch_size": args.batch_size, + "num_cores": args.num_cores, + "scenario": args.scenario, + "seconds_to_run": time, + "num_streams": args.num_streams, + "input_config": config, + "benchmark_results": benchmark_results, + } + # Export results export_path = args.export_path if export_path: @@ -537,8 +426,9 @@ def main(): print("Original Model Path: {}".format(args.model_path)) print("Batch Size: {}".format(args.batch_size)) print("Scenario: {}".format(args.scenario)) - print("Iterations: {}".format(int(export_dict["iterations"]))) - print("Throughput (items/sec): {:.4f}".format(export_dict["items_per_sec"])) + print("Iterations: {}".format(int(benchmark_results["iterations"]))) + print("Total Runtime: {:.4f}".format(total_run_time)) + print("Throughput (items/sec): {:.4f}".format(benchmark_results["items_per_sec"])) print("Processing Time Breakdown: ") compute_sections = batch_times[0].stages diff --git a/src/deepsparse/benchmark/data_creation.py b/src/deepsparse/benchmark/data_creation.py new file mode 100644 index 0000000000..6b1036e98e --- /dev/null +++ b/src/deepsparse/benchmark/data_creation.py @@ -0,0 +1,170 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
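For reference, a results file written via `--export_path` mirrors the `export_dict` assembled above and could be inspected with a short script along these lines. This is only a sketch: `results.json` is an assumed example path, not a default of the tool, and the keys are taken from the dictionary built in `main()`.

```
import json

# Load a results file previously produced with `-x results.json`
# ("results.json" is an illustrative path, not a fixed default).
with open("results.json") as f:
    results = json.load(f)

print("scenario:", results["scenario"])
print("items/sec:", results["benchmark_results"]["items_per_sec"])
for section, stats in results["benchmark_results"]["compute_sections"].items():
    # Each section carries total_percentage, mean, median, std and percentile buckets
    print(section, stats["mean"], stats["total_percentage"])
```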
+ +import glob +import random +import string +from typing import Dict, List, Tuple + +import numpy + +from deepsparse import Pipeline + + +DEFAULT_STRING_LENGTH = 50 +DEFAULT_IMAGE_SHAPE = (240, 240, 3) + +__all__ = [ + "get_input_schema_type", + "get_files_with_endings", + "generate_sentence", + "generate_image_data", + "load_image_data", + "generate_text_data", + "load_text_data", + "generate_question_data", + "load_question_data", +] + + +def get_input_schema_type(pipeline: Pipeline) -> str: + input_schema_requirements = list(pipeline.input_schema.__fields__.keys()) + input_schema_fields = pipeline.input_schema.__fields__ + + if "images" in input_schema_requirements: + return "image" + if "sequences" in input_schema_requirements: + sequence_types = [ + f.outer_type_ for f in input_schema_fields["sequences"].sub_fields + ] + if List[str] in sequence_types: + return "text_sequence" + elif "inputs" in input_schema_requirements: + sequence_types = [ + f.outer_type_ for f in input_schema_fields["inputs"].sub_fields + ] + if List[str] in sequence_types: + return "text_inputs" + elif "question" in input_schema_requirements: + return "question" + + raise Exception("Unknown schema requirement {}".format(input_schema_requirements)) + + +def get_files_with_endings( + folder: str, num_files: int, recursive: bool, file_endings: List[str] +) -> List[str]: + files = [] + for f in glob.glob(folder + "/**", recursivere=recursive): + if f.lower().endswith(file_endings): + files.append(f) + if len(files) < num_files: + raise Exception("Not enough images found in {}".format(folder)) + return random.sample(files, num_files) + + +def generate_sentence(string_length: int, avg_word_length: int = 5): + random_chars = "".join(random.choices(string.ascii_letters, k=string_length)) + space_locations = random.sample( + range(string_length), int(string_length / avg_word_length) + ) + random_chars = list(random_chars) + for loc in space_locations: + random_chars[loc] = " " + return "".join(random_chars) + + +def generate_image_data( + config: Dict, batch_size: int, logger: object +) -> List[numpy.ndarray]: + input_data = [] + if "input_image_shape" in config and len(config["input_image_shape"]) == 3: + image_shape = config["input_image_shape"] + else: + image_shape = DEFAULT_IMAGE_SHAPE + logger.warning("Using default image shape {}".format(image_shape)) + + for _ in range(batch_size): + rand_array = numpy.random.randint(0, high=255, size=image_shape).astype( + numpy.uint8 + ) + input_data.append(rand_array) + + return input_data + + +def load_image_data(config: Dict, batch_size: int) -> List[str]: + path_to_data = config["data_folder"] + recursive_search = config["recursive_search"] + return get_files_with_endings( + path_to_data, batch_size, recursive_search, [".jpg", ".jpeg", ".gif"] + ) + + +def generate_text_data(config: Dict, batch_size: int, logger: object) -> List[str]: + input_data = [] + if "gen_sequence_length" in config: + string_length = config["gen_sequence_length"] + else: + string_length = DEFAULT_STRING_LENGTH + logger.warning("Using default string length {}".format(string_length)) + for _ in range(batch_size): + rand_sentence = generate_sentence(string_length) + input_data.append(rand_sentence) + + return input_data + + +def load_text_data(config: Dict, batch_size: int, logger: object) -> List[str]: + path_to_data = config["data_folder"] + recursive_search = config["recursive_search"] + input_files = get_files_with_endings( + path_to_data, batch_size, recursive_search, [".txt"] + ) + if 
"max_string_length" in config: + max_string_length = config["max_string_length"] + else: + max_string_length = -1 + logger.warning("Using default max string length {}".format(max_string_length)) + input_data = [] + for f_path in input_files: + f = open(f_path) + text_data = f.read() + f.close() + input_data.append(text_data[:max_string_length]) + return input_data + + +def generate_question_data(config: Dict, logger: object) -> Tuple[str, str]: + if "gen_sequence_length" in config: + string_length = config["gen_sequence_length"] + else: + string_length = DEFAULT_STRING_LENGTH + logger.warning("Using default string length {}".format(string_length)) + question = generate_sentence(string_length) + context = generate_sentence(string_length) + return (question, context) + + +def load_question_data(config: Dict) -> Tuple[str, str]: + path_to_questions = config["question_file"] + path_to_context = config["context_file"] + + f_question = open(path_to_questions) + f_context = open(path_to_context) + question = f_question.read() + context = f_context.read() + f_question.close() + f_context.close() + return question, context diff --git a/src/deepsparse/benchmark/helpers.py b/src/deepsparse/benchmark/helpers.py index d0ccb95295..50d3bced2b 100644 --- a/src/deepsparse/benchmark/helpers.py +++ b/src/deepsparse/benchmark/helpers.py @@ -12,16 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json import os +from typing import Dict from deepsparse import Scheduler +DEFAULT_STRING_LENGTH = 50 +DEFAULT_IMAGE_SHAPE = (240, 240, 3) + __all__ = [ "decide_thread_pinning", "parse_scheduler", "parse_scenario", "parse_num_streams", + "parse_input_config", ] @@ -95,3 +101,10 @@ def parse_num_streams(num_streams: int, num_cores: int, scenario: str, logger: o ) ) return default_num_streams + + +def parse_input_config(input_config_file: str) -> Dict[str, any]: + config_file = open(input_config_file) + config = json.load(config_file) + config_file.close() + return config From 9202a6fdc46c6aec38abd925fe4b893b9b8ff023 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Fri, 28 Jul 2023 15:59:03 -0400 Subject: [PATCH 13/37] adding unit tests --- setup.py | 1 + src/deepsparse/benchmark/README.md | 12 +- src/deepsparse/benchmark/benchmark_model.py | 6 +- .../benchmark/benchmark_pipeline.py | 111 ++++++++++--- src/deepsparse/benchmark/data_creation.py | 54 +++--- src/deepsparse/benchmark/helpers.py | 23 +-- tests/test_pipeline_benchmark.py | 154 ++++++++++++++++++ 7 files changed, 300 insertions(+), 61 deletions(-) create mode 100644 tests/test_pipeline_benchmark.py diff --git a/setup.py b/setup.py index 8e425bd816..84289d3f6b 100644 --- a/setup.py +++ b/setup.py @@ -292,6 +292,7 @@ def _setup_entry_points() -> Dict: "deepsparse.analyze=deepsparse.analyze:main", "deepsparse.check_hardware=deepsparse.cpu:print_hardware_capability", "deepsparse.benchmark=deepsparse.benchmark.benchmark_model:main", + "deepsparse.benchmark_pipeline=deepsparse.benchmark.benchmark_pipeline:main", # noqa E501 "deepsparse.benchmark_sweep=deepsparse.benchmark.benchmark_sweep:main", "deepsparse.server=deepsparse.server.cli:main", "deepsparse.object_detection.annotate=deepsparse.yolo.annotate:main", diff --git a/src/deepsparse/benchmark/README.md b/src/deepsparse/benchmark/README.md index f530c0255b..7912abe18c 100644 --- a/src/deepsparse/benchmark/README.md +++ b/src/deepsparse/benchmark/README.md @@ -189,10 +189,10 @@ Iterations: 622 ``` ## 📜 Benchmarking Pipelines -Expanding on the 
model benchmarking script, the pipeline benchmarker is a tool for benchmarking end-to-end inference, including pre and post processing. The script can generate fake input data based on the pipeline's input schema, or load it from a local folder. The pipeline then runs pre-processing, engine inference and post-processing. Benchmarking results are reported by section, useful for identifying bottlenecks. +Expanding on the model benchmarking script, `deepsparse.benchmark_pipeline` is a tool for benchmarking end-to-end inference, including pre and post processing. The script can generate fake input data based on the pipeline's input schema, or load it from a local folder. The pipeline then runs pre-processing, engine inference and post-processing. Benchmarking results are reported by section, useful for identifying bottlenecks. ### Usage -Input arguments are the same as the Engine benchmarker, but with two addtions: +Input arguments are the same as the Engine benchmarker, but with two additions: ``` positional arguments: @@ -258,12 +258,12 @@ Additional arguments to the pipeline or input_schema can be added to the `pipeli ### Example Usage -Running image classification for 30 seconds with a batch size of 32: +Running ResNet image classification for 30 seconds with a batch size of 32: ``` -python benchmark_pipeline.py image_classification zoo:cv/classification/resnet_v1-50_2x/pytorch/sparseml/imagenet/base-none -c config.json -t 60 -b 32 +deepsparse.benchmark_pipeline image_classification zoo:cv/classification/resnet_v1-50_2x/pytorch/sparseml/imagenet/base-none -c config.json -t 60 -b 32 ``` -Running text generation for 30 seconds asynchronously +Running CodeGen text generation for 30 seconds asynchronously ``` -python benchmark_pipeline.py text_generation image_classification zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/pruned50-none -c config.json -t 30 -s async +deepsparse.benchmark_pipeline text_generation zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/pruned50-none -c config.json -t 30 -s async ``` \ No newline at end of file diff --git a/src/deepsparse/benchmark/benchmark_model.py b/src/deepsparse/benchmark/benchmark_model.py index 04fcdb8c7a..8c978ce87c 100644 --- a/src/deepsparse/benchmark/benchmark_model.py +++ b/src/deepsparse/benchmark/benchmark_model.py @@ -282,15 +282,15 @@ def benchmark_model( if num_cores is None: num_cores = cpu_architecture().num_available_physical_cores - decide_thread_pinning(thread_pinning, _LOGGER) + decide_thread_pinning(thread_pinning) - scenario = parse_scenario(scenario.lower(), _LOGGER) + scenario = parse_scenario(scenario.lower()) scheduler = parse_scheduler(scenario) input_shapes = parse_input_shapes(input_shapes) orig_model_path = model_path model_path = model_to_path(model_path) - num_streams = parse_num_streams(num_streams, num_cores, scenario, _LOGGER) + num_streams = parse_num_streams(num_streams, num_cores, scenario) # Compile the ONNX into a runnable model if engine == DEEPSPARSE_ENGINE: diff --git a/src/deepsparse/benchmark/benchmark_pipeline.py b/src/deepsparse/benchmark/benchmark_pipeline.py index 1ff586253b..9cb8a9ed25 100644 --- a/src/deepsparse/benchmark/benchmark_pipeline.py +++ b/src/deepsparse/benchmark/benchmark_pipeline.py @@ -12,6 +12,75 @@ # See the License for the specific language governing permissions and # limitations under the License. 
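The `-c config.json` argument shown in the README examples points at a JSON input configuration. As a rough sketch, such a file could be written out as below; the field names match the test config added later in this series, while the concrete values and the `/path/to/data` folder are illustrative only.

```
import json

# Illustrative input config for deepsparse.benchmark_pipeline; values are examples only.
config = {
    "data_type": "dummy",                # "dummy" generates inputs, "real" loads them from disk
    "gen_sequence_length": 100,          # length of generated text inputs
    "input_image_shape": [500, 500, 3],  # shape of generated image inputs
    "data_folder": "/path/to/data",      # only consulted when data_type is "real"
    "recursive_search": True,
    "max_string_length": -1,
    "pipeline_kwargs": {},
    "input_schema_kwargs": {},
}

with open("config.json", "w") as f:
    json.dump(config, f, indent=4)
```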
+""" +Benchmark DeepSparse Pipelines + +########## +Command help: +usage: deepsparse.benchmark_pipeline [-h] [-c INPUT_CONFIG] [-b BATCH_SIZE] + [-ncores NUM_CORES] [-s {async,sync,elastic}] + [-t TIME] [-w WARMUP_TIME] [-nstreams NUM_STREAMS] + [-pin {none,core,numa}] [-e ENGINE] + [-q] [-x EXPORT_PATH] task_name model_path + +positional arguments: + task_name Type of pipeline to run + model_path Path to an ONNX model file or SparseZoo model stub + +optional arguments: + -h, --help show this help message and exit + -c INPUT_CONFIG, --input_config INPUT_CONFIG + JSON file containing schema for input data + -b BATCH_SIZE, --batch_size BATCH_SIZE + The batch size to run the analysis for. Must be greater than 0 + -ncores NUM_CORES, --num_cores NUM_CORES + The number of physical cores to run the analysis on, + defaults to all physical cores available on the system. + -s {async,sync,elastic}, --scenario {async,sync,elastic} + Choose between using the async, sync and elastic + scenarios. Sync and async are similar to the single- + stream/multi-stream scenarios. Elastic is a newer + scenario that behaves similarly to the async scenario + but uses a different scheduling backend. Default value + is sync. + -t TIME, --time TIME The number of seconds the benchmark will run. Default + is 10 seconds. + -w WARMUP_TIME, --warmup_time WARMUP_TIME + The number of seconds the benchmark will warmup before + running.Default is 2 seconds. + -nstreams NUM_STREAMS, --num_streams NUM_STREAMS + The number of streams that will submit inferences in + parallel using async scenario. Default is + automatically determined for given hardware and may be + sub-optimal. + -pin {none,core,numa}, --thread_pinning {none,core,numa} + Enable binding threads to cores ('core' the default), + threads to cores on sockets ('numa'), or disable + ('none'). + -e {deepsparse,onnxruntime}, --engine {deepsparse,onnxruntime} + Inference engine backend to run eval on. Choices are + 'deepsparse', 'onnxruntime'. Default is 'deepsparse'. + -q, --quiet Lower logging verbosity. + -x EXPORT_PATH, --export_path EXPORT_PATH + Store results into a JSON file. 
+ +########## +Example ResNet image classification for 30 seconds with a batch size of 32: +``` +deepsparse.benchmark_pipeline \ + image_classification \ + zoo:cv/classification/resnet_v1-50_2x/pytorch/sparseml/imagenet/base-none \ + -c config.json -t 60 -b 32 + +########## +Example CodeGen text generation for 30 seconds asynchronously +deepsparse.benchmark_pipeline \ + text_generation \ + zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/ + bigpython_bigquery_thepile/pruned50-none \ + -c config.json -t 30 -s async +""" + import argparse import json import logging @@ -24,6 +93,7 @@ from deepsparse import Pipeline, __version__ from deepsparse.benchmark.data_creation import ( + SchemaType, generate_image_data, generate_question_data, generate_text_data, @@ -52,6 +122,9 @@ DEEPSPARSE_ENGINE = "deepsparse" ORT_ENGINE = "onnxruntime" +DUMMY_INPUT_TYPE = "dummy" +REAL_INPUT_TYPE = "real" + def parse_args(): parser = argparse.ArgumentParser(description="Benchmark DeepSparse Pipelines") @@ -230,33 +303,33 @@ def create_input_schema( if "input_schema_kwargs" in config: kwargs = config["input_schema_kwargs"] - if input_type == "dummy": - if input_schema_requirement == "image": - input_data = generate_image_data(config, batch_size, _LOGGER) + if input_type == DUMMY_INPUT_TYPE: + if input_schema_requirement == SchemaType.IMAGE: + input_data = generate_image_data(config, batch_size) inputs = pipeline.input_schema(images=input_data, **kwargs) - elif input_schema_requirement == "text_sequence": - input_data = generate_text_data(config, batch_size, _LOGGER) + elif input_schema_requirement == SchemaType.TEXT_SEQ: + input_data = generate_text_data(config, batch_size) inputs = pipeline.input_schema(sequences=input_data, **kwargs) - elif input_schema_requirement == "text_inputs": - input_data = generate_text_data(config, batch_size, _LOGGER) + elif input_schema_requirement == SchemaType.TEXT_INPUT: + input_data = generate_text_data(config, batch_size) inputs = pipeline.input_schema(inputs=input_data, **kwargs) - elif input_schema_requirement == "question": + elif input_schema_requirement == SchemaType.QUESTION: _LOGGER.warn( "Only batch size of 1 supported for Question Answering Pipeline" ) - question, context = generate_question_data(config, _LOGGER) + question, context = generate_question_data(config) inputs = pipeline.input_schema(question=question, context=context, **kwargs) - elif input_type == "real": - if input_schema_requirement == "image": + elif input_type == REAL_INPUT_TYPE: + if input_schema_requirement == SchemaType.IMAGE: input_data = load_image_data(config, batch_size) inputs = pipeline.input_schema(images=input_data, **kwargs) - elif input_schema_requirement == "text_sequence": - input_data = load_text_data(config, _LOGGER) + elif input_schema_requirement == SchemaType.TEXT_SEQ: + input_data = load_text_data(config) inputs = pipeline.input_schema(sequences=input_data, **kwargs) - elif input_schema_requirement == "text_inputs": - input_data = load_text_data(config, batch_size, _LOGGER) + elif input_schema_requirement == SchemaType.TEXT_INPUT: + input_data = load_text_data(config, batch_size) inputs = pipeline.input_schema(inputs=input_data, **kwargs) - elif input_schema_requirement == "question": + elif input_schema_requirement == SchemaType.QUESTION: _LOGGER.warn( "Only batch size of 1 supported for Question Answering Pipeline" ) @@ -289,10 +362,10 @@ def benchmark_pipeline( if num_cores is None: num_cores = cpu_architecture().num_available_physical_cores - 
decide_thread_pinning(thread_pinning, _LOGGER) - scenario = parse_scenario(scenario.lower(), _LOGGER) + decide_thread_pinning(thread_pinning) + scenario = parse_scenario(scenario.lower()) scheduler = parse_scheduler(scenario) - num_streams = parse_num_streams(num_streams, num_cores, scenario, _LOGGER) + num_streams = parse_num_streams(num_streams, num_cores, scenario) input_type = config["data_type"] kwargs = {} diff --git a/src/deepsparse/benchmark/data_creation.py b/src/deepsparse/benchmark/data_creation.py index 6b1036e98e..502a6ca805 100644 --- a/src/deepsparse/benchmark/data_creation.py +++ b/src/deepsparse/benchmark/data_creation.py @@ -13,6 +13,7 @@ # limitations under the License. import glob +import logging import random import string from typing import Dict, List, Tuple @@ -22,6 +23,8 @@ from deepsparse import Pipeline +_LOGGER = logging.getLogger(__name__) + DEFAULT_STRING_LENGTH = 50 DEFAULT_IMAGE_SHAPE = (240, 240, 3) @@ -38,26 +41,33 @@ ] +class SchemaType: + IMAGE: str = "images" + TEXT_SEQ: str = "sequences" + TEXT_INPUT: str = "inputs" + QUESTION: str = "question" + + def get_input_schema_type(pipeline: Pipeline) -> str: input_schema_requirements = list(pipeline.input_schema.__fields__.keys()) input_schema_fields = pipeline.input_schema.__fields__ - if "images" in input_schema_requirements: - return "image" - if "sequences" in input_schema_requirements: + if SchemaType.IMAGE in input_schema_requirements: + return SchemaType.IMAGE + if SchemaType.TEXT_SEQ in input_schema_requirements: sequence_types = [ - f.outer_type_ for f in input_schema_fields["sequences"].sub_fields + f.outer_type_ for f in input_schema_fields[SchemaType.TEXT_SEQ].sub_fields ] if List[str] in sequence_types: - return "text_sequence" - elif "inputs" in input_schema_requirements: + return SchemaType.TEXT_SEQ + elif SchemaType.TEXT_INPUT in input_schema_requirements: sequence_types = [ - f.outer_type_ for f in input_schema_fields["inputs"].sub_fields + f.outer_type_ for f in input_schema_fields[SchemaType.TEXT_INPUT].sub_fields ] if List[str] in sequence_types: - return "text_inputs" - elif "question" in input_schema_requirements: - return "question" + return SchemaType.TEXT_INPUT + elif SchemaType.QUESTION in input_schema_requirements: + return SchemaType.QUESTION raise Exception("Unknown schema requirement {}".format(input_schema_requirements)) @@ -85,15 +95,13 @@ def generate_sentence(string_length: int, avg_word_length: int = 5): return "".join(random_chars) -def generate_image_data( - config: Dict, batch_size: int, logger: object -) -> List[numpy.ndarray]: +def generate_image_data(config: Dict, batch_size: int) -> List[numpy.ndarray]: input_data = [] if "input_image_shape" in config and len(config["input_image_shape"]) == 3: image_shape = config["input_image_shape"] else: image_shape = DEFAULT_IMAGE_SHAPE - logger.warning("Using default image shape {}".format(image_shape)) + _LOGGER.warning("Using default image shape {}".format(image_shape)) for _ in range(batch_size): rand_array = numpy.random.randint(0, high=255, size=image_shape).astype( @@ -112,21 +120,21 @@ def load_image_data(config: Dict, batch_size: int) -> List[str]: ) -def generate_text_data(config: Dict, batch_size: int, logger: object) -> List[str]: +def generate_text_data(config: Dict, batch_size: int, avg_word_len=5) -> List[str]: input_data = [] if "gen_sequence_length" in config: string_length = config["gen_sequence_length"] else: string_length = DEFAULT_STRING_LENGTH - logger.warning("Using default string length 
{}".format(string_length)) + _LOGGER.warning("Using default string length {}".format(string_length)) for _ in range(batch_size): - rand_sentence = generate_sentence(string_length) + rand_sentence = generate_sentence(string_length, avg_word_length=avg_word_len) input_data.append(rand_sentence) return input_data -def load_text_data(config: Dict, batch_size: int, logger: object) -> List[str]: +def load_text_data(config: Dict, batch_size: int) -> List[str]: path_to_data = config["data_folder"] recursive_search = config["recursive_search"] input_files = get_files_with_endings( @@ -136,7 +144,7 @@ def load_text_data(config: Dict, batch_size: int, logger: object) -> List[str]: max_string_length = config["max_string_length"] else: max_string_length = -1 - logger.warning("Using default max string length {}".format(max_string_length)) + _LOGGER.warning("Using default max string length {}".format(max_string_length)) input_data = [] for f_path in input_files: f = open(f_path) @@ -146,14 +154,14 @@ def load_text_data(config: Dict, batch_size: int, logger: object) -> List[str]: return input_data -def generate_question_data(config: Dict, logger: object) -> Tuple[str, str]: +def generate_question_data(config: Dict, avg_word_len=5) -> Tuple[str, str]: if "gen_sequence_length" in config: string_length = config["gen_sequence_length"] else: string_length = DEFAULT_STRING_LENGTH - logger.warning("Using default string length {}".format(string_length)) - question = generate_sentence(string_length) - context = generate_sentence(string_length) + _LOGGER.warning("Using default string length {}".format(string_length)) + question = generate_sentence(string_length, avg_word_length=avg_word_len) + context = generate_sentence(string_length, avg_word_length=avg_word_len) return (question, context) diff --git a/src/deepsparse/benchmark/helpers.py b/src/deepsparse/benchmark/helpers.py index 50d3bced2b..6702d269d7 100644 --- a/src/deepsparse/benchmark/helpers.py +++ b/src/deepsparse/benchmark/helpers.py @@ -13,12 +13,15 @@ # limitations under the License. 
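A rough usage sketch of the dummy-data helpers above, assuming the import path from this diff (`deepsparse.benchmark.data_creation`); the shape and length values are arbitrary examples, not defaults.

```
from deepsparse.benchmark.data_creation import (
    generate_image_data,
    generate_sentence,
    generate_text_data,
)

# Example config; both keys are optional and fall back to the module defaults.
config = {"gen_sequence_length": 128, "input_image_shape": [224, 224, 3]}

images = generate_image_data(config, batch_size=4)    # 4 random uint8 arrays of 224x224x3
sentences = generate_text_data(config, batch_size=4)  # 4 random strings of length 128
print(images[0].shape, len(sentences[0]))
print(generate_sentence(string_length=50, avg_word_length=5))
```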
import json +import logging import os from typing import Dict from deepsparse import Scheduler +_LOGGER = logging.getLogger(__name__) + DEFAULT_STRING_LENGTH = 50 DEFAULT_IMAGE_SHAPE = (240, 240, 3) @@ -31,21 +34,21 @@ ] -def decide_thread_pinning(pinning_mode: str, logger: object) -> None: +def decide_thread_pinning(pinning_mode: str) -> None: pinning_mode = pinning_mode.lower() if pinning_mode in "core": os.environ["NM_BIND_THREADS_TO_CORES"] = "1" - logger.info("Thread pinning to cores enabled") + _LOGGER.info("Thread pinning to cores enabled") elif pinning_mode in "numa": os.environ["NM_BIND_THREADS_TO_CORES"] = "0" os.environ["NM_BIND_THREADS_TO_SOCKETS"] = "1" - logger.info("Thread pinning to socket/numa nodes enabled") + _LOGGER.info("Thread pinning to socket/numa nodes enabled") elif pinning_mode in "none": os.environ["NM_BIND_THREADS_TO_CORES"] = "0" os.environ["NM_BIND_THREADS_TO_SOCKETS"] = "0" - logger.info("Thread pinning disabled, performance may be sub-optimal") + _LOGGER.info("Thread pinning disabled, performance may be sub-optimal") else: - logger.info( + _LOGGER.info( "Recieved invalid option for thread_pinning '{}', skipping".format( pinning_mode ) @@ -64,7 +67,7 @@ def parse_scheduler(scenario: str) -> Scheduler: return Scheduler.multi_stream -def parse_scenario(scenario: str, logger: object) -> str: +def parse_scenario(scenario: str) -> str: scenario = scenario.lower() if scenario == "async": return "multistream" @@ -73,7 +76,7 @@ def parse_scenario(scenario: str, logger: object) -> str: elif scenario == "elastic": return "elastic" else: - logger.info( + _LOGGER.info( "Recieved invalid option for scenario'{}', defaulting to async".format( scenario ) @@ -81,20 +84,20 @@ def parse_scenario(scenario: str, logger: object) -> str: return "multistream" -def parse_num_streams(num_streams: int, num_cores: int, scenario: str, logger: object): +def parse_num_streams(num_streams: int, num_cores: int, scenario: str): # If model.num_streams is set, and the scenario is either "multi_stream" or # "elastic", use the value of num_streams given to us by the model, otherwise # use a semi-sane default value. if scenario == "sync" or scenario == "singlestream": if num_streams and num_streams > 1: - logger.info("num_streams reduced to 1 for singlestream scenario.") + _LOGGER.info("num_streams reduced to 1 for singlestream scenario.") return 1 else: if num_streams: return num_streams else: default_num_streams = max(1, int(num_cores / 2)) - logger.info( + _LOGGER.info( "num_streams default value chosen of {}. " "This requires tuning and may be sub-optimal".format( default_num_streams diff --git a/tests/test_pipeline_benchmark.py b/tests/test_pipeline_benchmark.py new file mode 100644 index 0000000000..f4b19f4edf --- /dev/null +++ b/tests/test_pipeline_benchmark.py @@ -0,0 +1,154 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
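A quick illustration of the scenario/stream helpers above, assuming the `deepsparse.benchmark.helpers` import path from this diff; the core count of 8 is an arbitrary example.

```
from deepsparse.benchmark.helpers import parse_num_streams, parse_scenario

scenario = parse_scenario("async")                  # normalizes to "multistream"
num_streams = parse_num_streams(None, 8, scenario)  # no explicit value -> max(1, 8 // 2) == 4
print(scenario, num_streams)
```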
+ +import math +from typing import List + +import numpy + +import pytest +from deepsparse import Pipeline +from deepsparse.benchmark.benchmark_pipeline import calculate_section_stats +from deepsparse.benchmark.data_creation import ( + SchemaType, + generate_image_data, + generate_question_data, + generate_text_data, + get_input_schema_type, +) +from deepsparse.utils import StagedTimer +from tests.helpers import run_command + + +@pytest.mark.parametrize( + ("pipeline_id", "model_stub", "additional_opts"), + [ + ( + "text_classification", + "zoo:nlp/sentiment_analysis/distilbert-none/pytorch/huggingface/" + "sst2/pruned90-none", + ["-c", "tests/test_data/pipeline_bench_config.json", "-b", "4"], + ), + ( + "image_classification", + "zoo:cv/classification/resnet_v1-50_2x/pytorch/sparseml/imagenet/base-none", + ["-c", "tests/test_data/pipeline_bench_config.json", "-s", "async"], + ), + ( + "question_answering", + "zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/" + "12layer_pruned80_quant-none-vnni", + [ + "-c", + "tests/test_data/pipeline_bench_config.json", + "-t", + "5", + "-s", + "elastic", + ], + ), + ( + "token_classification", + "zoo:nlp/token_classification/distilbert-none/pytorch/huggingface/" + "conll2003/pruned90-none", + ["-c", "tests/test_data/pipeline_bench_config.json", "-t", "3"], + ), + ], +) +def test_pipeline_benchmark( + pipeline_id: str, model_stub: str, additional_opts: List[str] +): + cmd = [ + "deepsparse.benchmark_pipeline", + pipeline_id, + model_stub, + *additional_opts, + ] + print(f"\n==== test_benchmark command ====\n{' '.join(cmd)}") + res = run_command(cmd) + if res.stdout is not None: + print(f"\n==== test_benchmark output ====\n{res.stdout}") + assert res.returncode == 0 + assert "error" not in res.stdout.lower() + assert "fail" not in res.stdout.lower() + + +def test_generate_image_data(): + batch_size = 32 + config = {"input_image_shape": (600, 600, 1)} + image_data = generate_image_data(config, batch_size) + assert len(image_data) == batch_size + img = image_data[0] + assert img.shape == (600, 600, 1) + assert img.dtype == numpy.uint8 + assert numpy.max(img) < 255 and numpy.min(img) >= 0 + + +def test_generate_text_data(): + batch_size = 16 + avg_word_len = 8 + config = {"gen_sequence_length": 250} + text_data = generate_text_data(config, batch_size, avg_word_len=avg_word_len) + assert len(text_data) == batch_size + text = text_data[0] + assert len(text) == 250 + num_spaces = text.count(" ") + assert num_spaces == int(len(text) / avg_word_len) + + +def test_generate_question_data(): + avg_word_len = 10 + config = {"gen_sequence_length": 50} + question, context = generate_question_data(config, avg_word_len=avg_word_len) + assert len(question) == config["gen_sequence_length"] + assert len(context) == config["gen_sequence_length"] + num_q_spaces = question.count(" ") + num_c_spaces = context.count(" ") + assert num_q_spaces == num_c_spaces == int(len(question) / avg_word_len) + + +@pytest.mark.parametrize( + ("task_name", "input_schema"), + [ + ("yolo", SchemaType.IMAGE), + ("text_classification", SchemaType.TEXT_SEQ), + ("transformers_embedding_extraction", SchemaType.TEXT_INPUT), + ("question_answering", SchemaType.QUESTION), + ], +) +def test_get_input_schema_type(task_name, input_schema): + pipeline = Pipeline.create(task=task_name) + assert get_input_schema_type(pipeline) == input_schema + + +def test_calculations(): + batch_times = [] + for i in range(5): + timer = StagedTimer() + timer._staged_start_times["stage_1"] = [i + 0.1] + 
timer._staged_stop_times["stage_1"] = [i + 0.5] + + timer._staged_start_times["stage_2"] = [i + 0.6] + timer._staged_stop_times["stage_2"] = [i + 0.9] + + batch_times.append(timer) + + total_run_time = 6.0 + section_stats = calculate_section_stats(batch_times, total_run_time) + assert math.isclose( + section_stats["stage_1"]["total_percentage"], 33.33, rel_tol=0.05 + ) + assert math.isclose(section_stats["stage_2"]["total_percentage"], 25, rel_tol=0.05) + assert math.isclose(section_stats["stage_1"]["mean"], 400, rel_tol=0.05) + assert math.isclose(section_stats["stage_2"]["median"], 300, rel_tol=0.05) From 2ed018574c83fe68caffd8cd34c13446623afb93 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Fri, 28 Jul 2023 16:22:10 -0400 Subject: [PATCH 14/37] adding missing test file --- tests/test_data/pipeline_bench_config.json | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 tests/test_data/pipeline_bench_config.json diff --git a/tests/test_data/pipeline_bench_config.json b/tests/test_data/pipeline_bench_config.json new file mode 100644 index 0000000000..5886762cea --- /dev/null +++ b/tests/test_data/pipeline_bench_config.json @@ -0,0 +1,10 @@ +{ + "data_type": "dummy", + "gen_sequence_length": 100, + "input_image_shape": [500,500,3], + "data_folder": "/home/sadkins/imagenette2-320/", + "recursive_search": true, + "max_string_length": -1, + "pipeline_kwargs": {}, + "input_schema_kwargs": {} +} \ No newline at end of file From 729447e57652a3a81780f1511b3bd2b56c3efe5f Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Mon, 31 Jul 2023 10:58:21 -0400 Subject: [PATCH 15/37] skipping test w/high memory usage --- tests/test_pipeline_benchmark.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_pipeline_benchmark.py b/tests/test_pipeline_benchmark.py index f4b19f4edf..cabee84196 100644 --- a/tests/test_pipeline_benchmark.py +++ b/tests/test_pipeline_benchmark.py @@ -31,6 +31,7 @@ from tests.helpers import run_command +@pytest.mark.skip(reason="High memory usage, causes GitHub test run to be killed") @pytest.mark.parametrize( ("pipeline_id", "model_stub", "additional_opts"), [ From abb4811fd8dcfb159b7778fed3cb56447153e811 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Mon, 31 Jul 2023 11:06:13 -0400 Subject: [PATCH 16/37] skip test with high memory usage --- tests/test_pipeline_benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_pipeline_benchmark.py b/tests/test_pipeline_benchmark.py index cabee84196..06235d789d 100644 --- a/tests/test_pipeline_benchmark.py +++ b/tests/test_pipeline_benchmark.py @@ -31,7 +31,6 @@ from tests.helpers import run_command -@pytest.mark.skip(reason="High memory usage, causes GitHub test run to be killed") @pytest.mark.parametrize( ("pipeline_id", "model_stub", "additional_opts"), [ @@ -67,6 +66,7 @@ ), ], ) +@pytest.mark.skip(reason="High memory usage, causes GitHub test run to be killed") def test_pipeline_benchmark( pipeline_id: str, model_stub: str, additional_opts: List[str] ): From 8cdbe9bbbe29248cc6058e910c6bc2c78b8f92fd Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Mon, 31 Jul 2023 11:31:09 -0400 Subject: [PATCH 17/37] unit test memory --- tests/test_pipeline_benchmark.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_pipeline_benchmark.py b/tests/test_pipeline_benchmark.py index 06235d789d..75a6bc1960 100644 --- a/tests/test_pipeline_benchmark.py +++ b/tests/test_pipeline_benchmark.py @@ -128,6 +128,7 @@ def test_generate_question_data(): ("question_answering", SchemaType.QUESTION), 
], ) +@pytest.mark.skip(reason="High memory usage, causes GitHub test run to be killed") def test_get_input_schema_type(task_name, input_schema): pipeline = Pipeline.create(task=task_name) assert get_input_schema_type(pipeline) == input_schema From 1058f0b6aa33fe7a0fcba2d6d7d70013407e35b4 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Mon, 31 Jul 2023 11:56:39 -0400 Subject: [PATCH 18/37] add tests back in --- tests/test_pipeline_benchmark.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_pipeline_benchmark.py b/tests/test_pipeline_benchmark.py index 75a6bc1960..ea749027e6 100644 --- a/tests/test_pipeline_benchmark.py +++ b/tests/test_pipeline_benchmark.py @@ -66,7 +66,6 @@ ), ], ) -@pytest.mark.skip(reason="High memory usage, causes GitHub test run to be killed") def test_pipeline_benchmark( pipeline_id: str, model_stub: str, additional_opts: List[str] ): From 249e6452484ebf552037f261f4abd1b9aa237775 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Mon, 31 Jul 2023 11:59:24 -0400 Subject: [PATCH 19/37] add tests back in --- tests/test_pipeline_benchmark.py | 36 ++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/tests/test_pipeline_benchmark.py b/tests/test_pipeline_benchmark.py index ea749027e6..aadf3afb71 100644 --- a/tests/test_pipeline_benchmark.py +++ b/tests/test_pipeline_benchmark.py @@ -38,31 +38,45 @@ "text_classification", "zoo:nlp/sentiment_analysis/distilbert-none/pytorch/huggingface/" "sst2/pruned90-none", - ["-c", "tests/test_data/pipeline_bench_config.json", "-b", "4"], + [ + "-c", + "tests/test_data/pipeline_bench_config.json", + "-b", + "4", + "-t", + "3", + "-w", + "0.5", + ], ), ( "image_classification", "zoo:cv/classification/resnet_v1-50_2x/pytorch/sparseml/imagenet/base-none", - ["-c", "tests/test_data/pipeline_bench_config.json", "-s", "async"], - ), - ( - "question_answering", - "zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/" - "12layer_pruned80_quant-none-vnni", [ "-c", "tests/test_data/pipeline_bench_config.json", - "-t", - "5", "-s", - "elastic", + "async", + "-t", + "3", + "-w", + "0.5", ], ), ( "token_classification", "zoo:nlp/token_classification/distilbert-none/pytorch/huggingface/" "conll2003/pruned90-none", - ["-c", "tests/test_data/pipeline_bench_config.json", "-t", "3"], + [ + "-c", + "tests/test_data/pipeline_bench_config.json", + "-s", + "elastic", + "-t", + "3", + "-w", + "0.5", + ], ), ], ) From ba8688b4be776a1783cb08e883de1cf5b6ef67e6 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Mon, 31 Jul 2023 12:50:11 -0400 Subject: [PATCH 20/37] fix async percentages --- src/deepsparse/benchmark/benchmark_pipeline.py | 18 +++++++++++------- tests/test_pipeline_benchmark.py | 8 ++++---- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_pipeline.py b/src/deepsparse/benchmark/benchmark_pipeline.py index 9cb8a9ed25..0f6b554df2 100644 --- a/src/deepsparse/benchmark/benchmark_pipeline.py +++ b/src/deepsparse/benchmark/benchmark_pipeline.py @@ -407,10 +407,12 @@ def benchmark_pipeline( "Generated no batch timings, try extending benchmark time with '--time'" ) - return batch_times, total_run_time + return batch_times, total_run_time, num_streams -def calculate_statistics(batch_times_ms: List[float], total_run_time_ms: float) -> Dict: +def calculate_statistics( + batch_times_ms: List[float], total_run_time_ms: float, num_streams: int +) -> Dict: percentiles = [25.0, 50.0, 75.0, 90.0, 95.0, 99.0, 99.9] buckets = numpy.percentile(batch_times_ms, 
percentiles).tolist() percentiles_dict = { @@ -418,7 +420,7 @@ def calculate_statistics(batch_times_ms: List[float], total_run_time_ms: float) } benchmark_dict = { - "total_percentage": sum(batch_times_ms) / total_run_time_ms * 100, + "total_percentage": sum(batch_times_ms) / total_run_time_ms * 100 * num_streams, "median": numpy.median(batch_times_ms), "mean": numpy.mean(batch_times_ms), "std": numpy.std(batch_times_ms), @@ -428,7 +430,7 @@ def calculate_statistics(batch_times_ms: List[float], total_run_time_ms: float) def calculate_section_stats( - batch_times: List[StagedTimer], total_run_time: float + batch_times: List[StagedTimer], total_run_time: float, num_streams: int ) -> Dict[str, Dict]: compute_sections = batch_times[0].stages total_run_time_ms = total_run_time * 1000 @@ -436,7 +438,9 @@ def calculate_section_stats( sections = {} for section in compute_sections: section_times = [st.times[section] * 1000 for st in batch_times] - sections[section] = calculate_statistics(section_times, total_run_time_ms) + sections[section] = calculate_statistics( + section_times, total_run_time_ms, num_streams + ) return sections @@ -450,7 +454,7 @@ def main(): print("Batch Size: {}".format(args.batch_size)) print("Scenario: {}".format(args.scenario)) - batch_times, total_run_time = benchmark_pipeline( + batch_times, total_run_time, num_streams = benchmark_pipeline( model_path=args.model_path, task=args.task_name, config=config, @@ -465,7 +469,7 @@ def main(): quiet=args.quiet, ) - section_stats = calculate_section_stats(batch_times, total_run_time) + section_stats = calculate_section_stats(batch_times, total_run_time, num_streams) items_per_sec = (len(batch_times) * args.batch_size) / total_run_time benchmark_results = { diff --git a/tests/test_pipeline_benchmark.py b/tests/test_pipeline_benchmark.py index aadf3afb71..698e50c927 100644 --- a/tests/test_pipeline_benchmark.py +++ b/tests/test_pipeline_benchmark.py @@ -46,7 +46,7 @@ "-t", "3", "-w", - "0.5", + "1", ], ), ( @@ -60,7 +60,7 @@ "-t", "3", "-w", - "0.5", + "1", ], ), ( @@ -75,7 +75,7 @@ "-t", "3", "-w", - "0.5", + "1", ], ), ], @@ -160,7 +160,7 @@ def test_calculations(): batch_times.append(timer) total_run_time = 6.0 - section_stats = calculate_section_stats(batch_times, total_run_time) + section_stats = calculate_section_stats(batch_times, total_run_time, 1) assert math.isclose( section_stats["stage_1"]["total_percentage"], 33.33, rel_tol=0.05 ) From ecf15590936db9f9928efedd7b2e779591f6255c Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Mon, 31 Jul 2023 14:51:23 -0400 Subject: [PATCH 21/37] fix new quality errors --- src/deepsparse/utils/data.py | 2 +- tests/deepsparse/pipelines/test_dynamic_batch_pipeline.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/utils/data.py b/src/deepsparse/utils/data.py index df08fb5e0c..a8d463f7b9 100644 --- a/src/deepsparse/utils/data.py +++ b/src/deepsparse/utils/data.py @@ -101,7 +101,7 @@ def verify_outputs( raise Exception( f"Output shapes don't match, {output.shape} != {gt_output.shape}" ) - if type(output) != type(gt_output): + if type(output) is not type(gt_output): raise Exception( f"Output types don't match, {type(output)} != {type(gt_output)}" ) diff --git a/tests/deepsparse/pipelines/test_dynamic_batch_pipeline.py b/tests/deepsparse/pipelines/test_dynamic_batch_pipeline.py index 1d08fd9832..77f451da81 100644 --- a/tests/deepsparse/pipelines/test_dynamic_batch_pipeline.py +++ b/tests/deepsparse/pipelines/test_dynamic_batch_pipeline.py @@ -35,7 +35,7 @@ def 
compare(expected, actual): - assert type(expected) == type(actual) + assert type(expected) is type(actual) if isinstance(expected, (list, float, numpy.ndarray)): expected_np = numpy.asarray(expected, dtype=float) From e0f6ab34142fb99d4189f5515713e4cf20e9d9c6 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Tue, 1 Aug 2023 12:08:38 -0400 Subject: [PATCH 22/37] pass num_streams, fix percentage calculation for async --- src/deepsparse/benchmark/benchmark_pipeline.py | 4 +++- src/deepsparse/pipeline.py | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/deepsparse/benchmark/benchmark_pipeline.py b/src/deepsparse/benchmark/benchmark_pipeline.py index 0f6b554df2..80956ff18f 100644 --- a/src/deepsparse/benchmark/benchmark_pipeline.py +++ b/src/deepsparse/benchmark/benchmark_pipeline.py @@ -377,6 +377,7 @@ def benchmark_pipeline( engine_type=engine, scheduler=scheduler, num_cores=num_cores, + num_streams=num_streams, **kwargs, ) inputs = create_input_schema(pipeline, input_type, batch_size, config) @@ -419,8 +420,9 @@ def calculate_statistics( "{:2.1f}%".format(key): value for key, value in zip(percentiles, buckets) } + scaled_runtime = total_run_time_ms * num_streams benchmark_dict = { - "total_percentage": sum(batch_times_ms) / total_run_time_ms * 100 * num_streams, + "total_percentage": sum(batch_times_ms) / scaled_runtime * 100, "median": numpy.median(batch_times_ms), "mean": numpy.mean(batch_times_ms), "std": numpy.std(batch_times_ms), diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index 8a88dacbca..f5ff41a894 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -121,6 +121,9 @@ class PipelineImplementation(Pipeline): dynamic batch mode (Pipeline will accept any batch size). Default is 1 :param num_cores: number of CPU cores to allocate for inference engine. None specifies all available cores. Default is None + :param num_streams: The max number of requests the model can handle + concurrently. None or 0 implies a scheduler-defined default value; + default None :param scheduler: (deepsparse only) kind of scheduler to execute with. Pass None for the default :param input_shapes: list of shapes to set ONNX the inputs to. 
Pass None @@ -146,6 +149,7 @@ def __init__( engine_type: str = DEEPSPARSE_ENGINE, batch_size: Optional[int] = 1, num_cores: int = None, + num_streams: int = None, scheduler: Scheduler = None, input_shapes: List[List[int]] = None, context: Optional[Context] = None, @@ -181,6 +185,7 @@ def __init__( batch_size=self._batch_size or 1, # bs=1 for dynamic batch num_cores=num_cores, input_shapes=input_shapes, + num_streams=num_streams, ) if engine_type.lower() == DEEPSPARSE_ENGINE: self._engine_args["scheduler"] = scheduler From 9473b79ac8ab4ebaa12d73e8f547c1193dfd24aa Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Tue, 1 Aug 2023 13:18:15 -0400 Subject: [PATCH 23/37] fix for file loading --- src/deepsparse/benchmark/data_creation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/deepsparse/benchmark/data_creation.py b/src/deepsparse/benchmark/data_creation.py index 502a6ca805..769cb73404 100644 --- a/src/deepsparse/benchmark/data_creation.py +++ b/src/deepsparse/benchmark/data_creation.py @@ -73,10 +73,10 @@ def get_input_schema_type(pipeline: Pipeline) -> str: def get_files_with_endings( - folder: str, num_files: int, recursive: bool, file_endings: List[str] + folder: str, num_files: int, recursive: bool, file_endings: Tuple[str] ) -> List[str]: files = [] - for f in glob.glob(folder + "/**", recursivere=recursive): + for f in glob.glob(folder + "/**", recursive=recursive): if f.lower().endswith(file_endings): files.append(f) if len(files) < num_files: @@ -116,7 +116,7 @@ def load_image_data(config: Dict, batch_size: int) -> List[str]: path_to_data = config["data_folder"] recursive_search = config["recursive_search"] return get_files_with_endings( - path_to_data, batch_size, recursive_search, [".jpg", ".jpeg", ".gif"] + path_to_data, batch_size, recursive_search, (".jpg", ".jpeg", ".gif") ) @@ -138,7 +138,7 @@ def load_text_data(config: Dict, batch_size: int) -> List[str]: path_to_data = config["data_folder"] recursive_search = config["recursive_search"] input_files = get_files_with_endings( - path_to_data, batch_size, recursive_search, [".txt"] + path_to_data, batch_size, recursive_search, (".txt") ) if "max_string_length" in config: max_string_length = config["max_string_length"] From cc8de6aefb9d9c01febfb53444a64c3337026d3e Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Tue, 1 Aug 2023 14:34:31 -0400 Subject: [PATCH 24/37] PR comments --- .../benchmark/benchmark_pipeline.py | 39 ++++++++++++++----- src/deepsparse/benchmark/helpers.py | 6 +-- src/deepsparse/pipeline.py | 3 +- 3 files changed, 34 insertions(+), 14 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_pipeline.py b/src/deepsparse/benchmark/benchmark_pipeline.py index 80956ff18f..9db297b25b 100644 --- a/src/deepsparse/benchmark/benchmark_pipeline.py +++ b/src/deepsparse/benchmark/benchmark_pipeline.py @@ -122,8 +122,10 @@ DEEPSPARSE_ENGINE = "deepsparse" ORT_ENGINE = "onnxruntime" -DUMMY_INPUT_TYPE = "dummy" -REAL_INPUT_TYPE = "real" + +class PipelineInputType: + DUMMY: str = "dummy" + REAL: str = "real" def parse_args(): @@ -242,6 +244,13 @@ def parse_args(): class PipelineExecutorThread(threading.Thread): + """ + Run pipeline reoeatedly on inputs for max_time seconds, pushing the timer data to + the timer queue to store the runtime of each section of the pipeline. 
+ + For intended usage, see multistream_benchmark + """ + def __init__( self, pipeline: Pipeline, @@ -264,6 +273,10 @@ def run(self): def singlestream_benchmark( pipeline: Pipeline, inputs: List[any], seconds_to_run: float ) -> List[StagedTimer]: + """ + Run pipeline repeatedly on inputs for max_time seconds, storing the runtime of each + section of the pipeline in batch_timings + """ benchmark_end_time = time.perf_counter() + seconds_to_run batch_timings = [] while time.perf_counter() < benchmark_end_time: @@ -279,6 +292,10 @@ def multistream_benchmark( seconds_to_run: float, num_streams: int, ) -> List[StagedTimer]: + """ + Create num_streams threads, each of which calls PipelineExecutorThread.run() for + seconds_to_run seconds. Stores all timing info in a shared queue. + """ time_queue = queue.Queue() max_time = time.perf_counter() + seconds_to_run threads = [] @@ -287,7 +304,7 @@ def multistream_benchmark( threads.append(PipelineExecutorThread(pipeline, inputs, time_queue, max_time)) for thread in threads: - thread.start() + thread.start() # triggers PipelineExecutorThread.run() for thread in threads: thread.join() @@ -296,14 +313,14 @@ def multistream_benchmark( def create_input_schema( - pipeline: Pipeline, input_type: str, batch_size: int, config: Dict + pipeline: Pipeline, input_type: PipelineInputType, batch_size: int, config: Dict ) -> any: input_schema_requirement = get_input_schema_type(pipeline) kwargs = {} if "input_schema_kwargs" in config: kwargs = config["input_schema_kwargs"] - if input_type == DUMMY_INPUT_TYPE: + if input_type == PipelineInputType.DUMMY: if input_schema_requirement == SchemaType.IMAGE: input_data = generate_image_data(config, batch_size) inputs = pipeline.input_schema(images=input_data, **kwargs) @@ -319,7 +336,7 @@ def create_input_schema( ) question, context = generate_question_data(config) inputs = pipeline.input_schema(question=question, context=context, **kwargs) - elif input_type == REAL_INPUT_TYPE: + elif input_type == PipelineInputType.REAL: if input_schema_requirement == SchemaType.IMAGE: input_data = load_image_data(config, batch_size) inputs = pipeline.input_schema(images=input_data, **kwargs) @@ -367,6 +384,8 @@ def benchmark_pipeline( scheduler = parse_scheduler(scenario) num_streams = parse_num_streams(num_streams, num_cores, scenario) + if "data_type" not in config: + raise Exception("Data type(dummy or real) must be specified in config") input_type = config["data_type"] kwargs = {} if "pipeline_kwargs" in config: @@ -451,10 +470,10 @@ def main(): args = parse_args() config = parse_input_config(args.input_config) - print("Original Model Path: {}".format(args.model_path)) - print("Task: {}".format(args.task_name)) - print("Batch Size: {}".format(args.batch_size)) - print("Scenario: {}".format(args.scenario)) + _LOGGER.info("Original Model Path: {}".format(args.model_path)) + _LOGGER.info("Task: {}".format(args.task_name)) + _LOGGER.info("Batch Size: {}".format(args.batch_size)) + _LOGGER.info("Scenario: {}".format(args.scenario)) batch_times, total_run_time, num_streams = benchmark_pipeline( model_path=args.model_path, diff --git a/src/deepsparse/benchmark/helpers.py b/src/deepsparse/benchmark/helpers.py index 6702d269d7..675226710b 100644 --- a/src/deepsparse/benchmark/helpers.py +++ b/src/deepsparse/benchmark/helpers.py @@ -22,9 +22,6 @@ _LOGGER = logging.getLogger(__name__) -DEFAULT_STRING_LENGTH = 50 -DEFAULT_IMAGE_SHAPE = (240, 240, 3) - __all__ = [ "decide_thread_pinning", "parse_scheduler", @@ -33,6 +30,9 @@ "parse_input_config", ] 
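For orientation, the reworked benchmark could also be driven programmatically along these lines. This is only a sketch: at this point in the series the config is still a plain dict, the model stub is the ResNet one from the README example (fetching it requires network access), and the run-time/warmup arguments are assumed to keep sensible defaults.

```
from deepsparse.benchmark.benchmark_pipeline import (
    benchmark_pipeline,
    calculate_section_stats,
)

# Minimal dummy-data config; "data_type" is required here, other keys fall back to defaults.
config = {"data_type": "dummy", "input_image_shape": [224, 224, 3]}

# Run-time/warmup arguments are omitted and assumed to default reasonably.
batch_times, total_run_time, num_streams = benchmark_pipeline(
    model_path="zoo:cv/classification/resnet_v1-50_2x/pytorch/sparseml/imagenet/base-none",
    task="image_classification",
    config=config,
    batch_size=1,
)

section_stats = calculate_section_stats(batch_times, total_run_time, num_streams)
for section, stats in section_stats.items():
    print(
        "%s: %.2f%% of runtime, %.4f ms mean"
        % (section, stats["total_percentage"], stats["mean"])
    )
```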
+DEFAULT_STRING_LENGTH = 50 +DEFAULT_IMAGE_SHAPE = (240, 240, 3) + def decide_thread_pinning(pinning_mode: str) -> None: pinning_mode = pinning_mode.lower() diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index bddad32b55..483aa68410 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -187,10 +187,10 @@ def __init__( batch_size=self._batch_size or 1, # bs=1 for dynamic batch num_cores=num_cores, input_shapes=input_shapes, - num_streams=num_streams, ) if engine_type.lower() == DEEPSPARSE_ENGINE: self._engine_args["scheduler"] = scheduler + self._engine_args["num_streams"] = num_streams self.onnx_file_path = self.setup_onnx_file_path() @@ -711,6 +711,7 @@ def create_engine( if context is not None and isinstance(context, Context): engine_args.pop("num_cores", None) engine_args.pop("scheduler", None) + engine_args.pop("num_streams", None) engine_args["context"] = context return MultiModelEngine( model=onnx_file_path, From b5ec9ae1dce596001a5b4931f52fd5a853323e84 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Tue, 1 Aug 2023 15:00:58 -0400 Subject: [PATCH 25/37] PR comments --- .../benchmark/benchmark_pipeline.py | 31 ++++++++-------- src/deepsparse/benchmark/data_creation.py | 35 +++++++++---------- src/deepsparse/benchmark/helpers.py | 14 +++----- 3 files changed, 35 insertions(+), 45 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_pipeline.py b/src/deepsparse/benchmark/benchmark_pipeline.py index 9db297b25b..9af7bc4d54 100644 --- a/src/deepsparse/benchmark/benchmark_pipeline.py +++ b/src/deepsparse/benchmark/benchmark_pipeline.py @@ -470,10 +470,11 @@ def main(): args = parse_args() config = parse_input_config(args.input_config) - _LOGGER.info("Original Model Path: {}".format(args.model_path)) - _LOGGER.info("Task: {}".format(args.task_name)) - _LOGGER.info("Batch Size: {}".format(args.batch_size)) - _LOGGER.info("Scenario: {}".format(args.scenario)) + _LOGGER.info("Original Model Path: %s" % args.model_path) + _LOGGER.info("Task: %s" % args.task_name) + _LOGGER.info("Batch Size: %d" % args.batch_size) + _LOGGER.info("Scenario: %s" % args.scenario) + _LOGGER.info("Requested Run Time(sec): %d" % args.time) batch_times, total_run_time, num_streams = benchmark_pipeline( model_path=args.model_path, @@ -516,30 +517,26 @@ def main(): # Export results export_path = args.export_path if export_path: - _LOGGER.info("Saving benchmark results to JSON file at {}".format(export_path)) + _LOGGER.info("Saving benchmark results to JSON file at %s" % export_path) with open(export_path, "w") as out: json.dump(export_dict, out, indent=2) # Results summary - print("Original Model Path: {}".format(args.model_path)) - print("Batch Size: {}".format(args.batch_size)) - print("Scenario: {}".format(args.scenario)) - print("Iterations: {}".format(int(benchmark_results["iterations"]))) - print("Total Runtime: {:.4f}".format(total_run_time)) - print("Throughput (items/sec): {:.4f}".format(benchmark_results["items_per_sec"])) + print("Original Model Path: %s" % args.model_path) + print("Batch Size: %d" % args.batch_size) + print("Scenario: %s" % args.scenario) + print("Iterations: %d" % int(benchmark_results["iterations"])) + print("Total Runtime: %.4f" % total_run_time) + print("Throughput (items/sec): %.4f" % benchmark_results["items_per_sec"]) print("Processing Time Breakdown: ") compute_sections = batch_times[0].stages for section in compute_sections: - print( - " {}: {:.2f}%".format( - section, section_stats[section]["total_percentage"] - ) - ) + print(" %s: 
%.2f" % (section, section_stats[section]["total_percentage"])) print("Mean Latency Breakdown (ms/batch): ") for section in compute_sections: - print(" {}: {:.4f}".format(section, section_stats[section]["mean"])) + print(" %s: %.4f" % (section, section_stats[section]["mean"])) if __name__ == "__main__": diff --git a/src/deepsparse/benchmark/data_creation.py b/src/deepsparse/benchmark/data_creation.py index 769cb73404..886d617310 100644 --- a/src/deepsparse/benchmark/data_creation.py +++ b/src/deepsparse/benchmark/data_creation.py @@ -101,7 +101,7 @@ def generate_image_data(config: Dict, batch_size: int) -> List[numpy.ndarray]: image_shape = config["input_image_shape"] else: image_shape = DEFAULT_IMAGE_SHAPE - _LOGGER.warning("Using default image shape {}".format(image_shape)) + _LOGGER.warning("Using default image shape %d" % image_shape) for _ in range(batch_size): rand_array = numpy.random.randint(0, high=255, size=image_shape).astype( @@ -121,16 +121,16 @@ def load_image_data(config: Dict, batch_size: int) -> List[str]: def generate_text_data(config: Dict, batch_size: int, avg_word_len=5) -> List[str]: - input_data = [] if "gen_sequence_length" in config: string_length = config["gen_sequence_length"] else: string_length = DEFAULT_STRING_LENGTH - _LOGGER.warning("Using default string length {}".format(string_length)) - for _ in range(batch_size): - rand_sentence = generate_sentence(string_length, avg_word_length=avg_word_len) - input_data.append(rand_sentence) + _LOGGER.warning("Using default string length %d" % string_length) + input_data = [ + generate_sentence(string_length, avg_word_length=avg_word_len) + for _ in range(batch_size) + ] return input_data @@ -144,13 +144,12 @@ def load_text_data(config: Dict, batch_size: int) -> List[str]: max_string_length = config["max_string_length"] else: max_string_length = -1 - _LOGGER.warning("Using default max string length {}".format(max_string_length)) + _LOGGER.warning("Using default max string length %d" % max_string_length) input_data = [] for f_path in input_files: - f = open(f_path) - text_data = f.read() - f.close() - input_data.append(text_data[:max_string_length]) + with open(f_path) as f: + text_data = f.read() + input_data.append(text_data[:max_string_length]) return input_data @@ -159,7 +158,7 @@ def generate_question_data(config: Dict, avg_word_len=5) -> Tuple[str, str]: string_length = config["gen_sequence_length"] else: string_length = DEFAULT_STRING_LENGTH - _LOGGER.warning("Using default string length {}".format(string_length)) + _LOGGER.warning("Using default string length %d" % string_length) question = generate_sentence(string_length, avg_word_length=avg_word_len) context = generate_sentence(string_length, avg_word_length=avg_word_len) return (question, context) @@ -169,10 +168,10 @@ def load_question_data(config: Dict) -> Tuple[str, str]: path_to_questions = config["question_file"] path_to_context = config["context_file"] - f_question = open(path_to_questions) - f_context = open(path_to_context) - question = f_question.read() - context = f_context.read() - f_question.close() - f_context.close() + question = "" + context = "" + with open(path_to_questions) as f: + question = f.read() + with open(path_to_context) as f: + context = f.read() return question, context diff --git a/src/deepsparse/benchmark/helpers.py b/src/deepsparse/benchmark/helpers.py index 675226710b..301e834480 100644 --- a/src/deepsparse/benchmark/helpers.py +++ b/src/deepsparse/benchmark/helpers.py @@ -49,9 +49,7 @@ def decide_thread_pinning(pinning_mode: 
str) -> None: _LOGGER.info("Thread pinning disabled, performance may be sub-optimal") else: _LOGGER.info( - "Recieved invalid option for thread_pinning '{}', skipping".format( - pinning_mode - ) + "Recieved invalid option for thread_pinning '%s', skipping" % pinning_mode ) @@ -77,9 +75,7 @@ def parse_scenario(scenario: str) -> str: return "elastic" else: _LOGGER.info( - "Recieved invalid option for scenario'{}', defaulting to async".format( - scenario - ) + "Recieved invalid option for scenario'%s', defaulting to async" % scenario ) return "multistream" @@ -98,10 +94,8 @@ def parse_num_streams(num_streams: int, num_cores: int, scenario: str): else: default_num_streams = max(1, int(num_cores / 2)) _LOGGER.info( - "num_streams default value chosen of {}. " - "This requires tuning and may be sub-optimal".format( - default_num_streams - ) + "num_streams default value chosen of %d. " + "This requires tuning and may be sub-optimal" % default_num_streams ) return default_num_streams From 99b4051efe5896590eb971e24a9bcd24ff341958 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Tue, 1 Aug 2023 16:59:54 -0400 Subject: [PATCH 26/37] BaseModel for pipeline config --- .../benchmark/benchmark_pipeline.py | 48 +++++------ src/deepsparse/benchmark/config.py | 85 +++++++++++++++++++ src/deepsparse/benchmark/data_creation.py | 62 +++++++++----- src/deepsparse/benchmark/helpers.py | 8 +- 4 files changed, 155 insertions(+), 48 deletions(-) create mode 100644 src/deepsparse/benchmark/config.py diff --git a/src/deepsparse/benchmark/benchmark_pipeline.py b/src/deepsparse/benchmark/benchmark_pipeline.py index 9af7bc4d54..748f7b2545 100644 --- a/src/deepsparse/benchmark/benchmark_pipeline.py +++ b/src/deepsparse/benchmark/benchmark_pipeline.py @@ -90,8 +90,10 @@ from typing import Dict, List, Tuple import numpy +from pydantic import BaseModel from deepsparse import Pipeline, __version__ +from deepsparse.benchmark.config import PipelineBenchmarkConfig, PipelineInputType from deepsparse.benchmark.data_creation import ( SchemaType, generate_image_data, @@ -123,11 +125,6 @@ ORT_ENGINE = "onnxruntime" -class PipelineInputType: - DUMMY: str = "dummy" - REAL: str = "real" - - def parse_args(): parser = argparse.ArgumentParser(description="Benchmark DeepSparse Pipelines") parser.add_argument("task_name", type=str, help="Type of pipeline to run") @@ -313,12 +310,13 @@ def multistream_benchmark( def create_input_schema( - pipeline: Pipeline, input_type: PipelineInputType, batch_size: int, config: Dict -) -> any: + pipeline: Pipeline, + input_type: PipelineInputType, + batch_size: int, + config: PipelineBenchmarkConfig, +) -> BaseModel: input_schema_requirement = get_input_schema_type(pipeline) - kwargs = {} - if "input_schema_kwargs" in config: - kwargs = config["input_schema_kwargs"] + kwargs = config.input_schema_kwargs if input_type == PipelineInputType.DUMMY: if input_schema_requirement == SchemaType.IMAGE: @@ -331,9 +329,10 @@ def create_input_schema( input_data = generate_text_data(config, batch_size) inputs = pipeline.input_schema(inputs=input_data, **kwargs) elif input_schema_requirement == SchemaType.QUESTION: - _LOGGER.warn( - "Only batch size of 1 supported for Question Answering Pipeline" - ) + if batch_size != 1: + _LOGGER.warning( + "Only batch size of 1 supported for Question Answering Pipeline" + ) question, context = generate_question_data(config) inputs = pipeline.input_schema(question=question, context=context, **kwargs) elif input_type == PipelineInputType.REAL: @@ -341,15 +340,16 @@ def 
create_input_schema( input_data = load_image_data(config, batch_size) inputs = pipeline.input_schema(images=input_data, **kwargs) elif input_schema_requirement == SchemaType.TEXT_SEQ: - input_data = load_text_data(config) + input_data = load_text_data(config, batch_size) inputs = pipeline.input_schema(sequences=input_data, **kwargs) elif input_schema_requirement == SchemaType.TEXT_INPUT: input_data = load_text_data(config, batch_size) inputs = pipeline.input_schema(inputs=input_data, **kwargs) elif input_schema_requirement == SchemaType.QUESTION: - _LOGGER.warn( - "Only batch size of 1 supported for Question Answering Pipeline" - ) + if batch_size != 1: + _LOGGER.warning( + "Only batch size of 1 supported for Question Answering Pipeline" + ) question, context = load_question_data(config) inputs = pipeline.input_schema(question=question, context=context, **kwargs) else: @@ -361,7 +361,7 @@ def create_input_schema( def benchmark_pipeline( model_path: str, task: str, - config: Dict, + config: PipelineBenchmarkConfig, batch_size: int = 1, num_cores: int = None, scenario: str = "sync", @@ -384,12 +384,8 @@ def benchmark_pipeline( scheduler = parse_scheduler(scenario) num_streams = parse_num_streams(num_streams, num_cores, scenario) - if "data_type" not in config: - raise Exception("Data type(dummy or real) must be specified in config") - input_type = config["data_type"] - kwargs = {} - if "pipeline_kwargs" in config: - kwargs = config["pipeline_kwargs"] + input_type = config.data_type + kwargs = config.pipeline_kwargs pipeline = Pipeline.create( task=task, model_path=model_path, @@ -510,7 +506,7 @@ def main(): "scenario": args.scenario, "seconds_to_run": time, "num_streams": args.num_streams, - "input_config": config, + "input_config": dict(config), "benchmark_results": benchmark_results, } @@ -532,7 +528,7 @@ def main(): print("Processing Time Breakdown: ") compute_sections = batch_times[0].stages for section in compute_sections: - print(" %s: %.2f" % (section, section_stats[section]["total_percentage"])) + print(" %s: %.2f%%" % (section, section_stats[section]["total_percentage"])) print("Mean Latency Breakdown (ms/batch): ") for section in compute_sections: diff --git a/src/deepsparse/benchmark/config.py b/src/deepsparse/benchmark/config.py new file mode 100644 index 0000000000..6829116758 --- /dev/null +++ b/src/deepsparse/benchmark/config.py @@ -0,0 +1,85 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Optional + +from pydantic import BaseModel, Field + + +__all__ = ["PipelineInputType", "PipelineBenchmarkConfig"] + + +class PipelineInputType: + DUMMY: str = "dummy" + REAL: str = "real" + + +class PipelineBenchmarkConfig(BaseModel): + data_type: str = Field( + default=PipelineInputType.DUMMY, + description=( + "Type of data source, dummy to generate data or real to load from file." 
+ ), + ) + + gen_sequence_length: Optional[int] = Field( + default=None, + description=( + "Number of characters to generate for pipelines that take text input." + ), + ) + + input_image_shape: Optional[List[int]] = Field( + default=None, + description=( + "Image size for pipelines that take image input, 3-dim with channel as the " + "last dimmension" + ), + ) + + data_folder: Optional[str] = Field( + default=None, + description=( + "Path to local folder of input data containing text or image files" + ), + ) + + recursive_search: bool = Field( + default=False, + description=("whether to recursively search through data_folder for files"), + ) + + max_string_length: int = Field( + default=-1, + description=( + "Maximum characters to read from each text file, -1 for no maximum" + ), + ) + + question_file: Optional[str] = Field( + default=None, description=("Path to text file to read question from") + ) + + context_file: Optional[str] = Field( + default=None, description=("Path to text file to read question context from") + ) + + pipeline_kwargs: Dict = Field( + default={}, description=("Additional arguments passed to pipeline creation") + ) + + input_schema_kwargs: Dict = Field( + default={}, + description=("Additional arguments passed to input schema creations "), + ) diff --git a/src/deepsparse/benchmark/data_creation.py b/src/deepsparse/benchmark/data_creation.py index 886d617310..8407c8cf0f 100644 --- a/src/deepsparse/benchmark/data_creation.py +++ b/src/deepsparse/benchmark/data_creation.py @@ -16,11 +16,13 @@ import logging import random import string +from os import path from typing import Dict, List, Tuple import numpy from deepsparse import Pipeline +from deepsparse.benchmark.config import PipelineBenchmarkConfig _LOGGER = logging.getLogger(__name__) @@ -75,6 +77,8 @@ def get_input_schema_type(pipeline: Pipeline) -> str: def get_files_with_endings( folder: str, num_files: int, recursive: bool, file_endings: Tuple[str] ) -> List[str]: + if not path.exists(folder): + raise Exception("Can't parse files, {} does not exist".format(folder)) files = [] for f in glob.glob(folder + "/**", recursive=recursive): if f.lower().endswith(file_endings): @@ -95,13 +99,17 @@ def generate_sentence(string_length: int, avg_word_length: int = 5): return "".join(random_chars) -def generate_image_data(config: Dict, batch_size: int) -> List[numpy.ndarray]: +def generate_image_data( + config: PipelineBenchmarkConfig, batch_size: int +) -> List[numpy.ndarray]: input_data = [] - if "input_image_shape" in config and len(config["input_image_shape"]) == 3: - image_shape = config["input_image_shape"] + if config.input_image_shape and len(config.input_image_shape) == 3: + image_shape = config.input_image_shape else: image_shape = DEFAULT_IMAGE_SHAPE - _LOGGER.warning("Using default image shape %d" % image_shape) + _LOGGER.warning( + f"Could not parse {config.input_image_shape}, Using default image shape {image_shape}" + ) for _ in range(batch_size): rand_array = numpy.random.randint(0, high=255, size=image_shape).astype( @@ -112,20 +120,24 @@ def generate_image_data(config: Dict, batch_size: int) -> List[numpy.ndarray]: return input_data -def load_image_data(config: Dict, batch_size: int) -> List[str]: - path_to_data = config["data_folder"] - recursive_search = config["recursive_search"] +def load_image_data(config: PipelineBenchmarkConfig, batch_size: int) -> List[str]: + if not config.data_folder: + raise Exception("Data folder must be defined for real inputs") + path_to_data = config.data_folder + 
recursive_search = config.recursive_search return get_files_with_endings( path_to_data, batch_size, recursive_search, (".jpg", ".jpeg", ".gif") ) -def generate_text_data(config: Dict, batch_size: int, avg_word_len=5) -> List[str]: - if "gen_sequence_length" in config: - string_length = config["gen_sequence_length"] +def generate_text_data( + config: PipelineBenchmarkConfig, batch_size: int, avg_word_len=5 +) -> List[str]: + if config.gen_sequence_length: + string_length = config.gen_sequence_length else: string_length = DEFAULT_STRING_LENGTH - _LOGGER.warning("Using default string length %d" % string_length) + _LOGGER.warning("Ssing default string length %d" % string_length) input_data = [ generate_sentence(string_length, avg_word_length=avg_word_len) @@ -134,14 +146,16 @@ def generate_text_data(config: Dict, batch_size: int, avg_word_len=5) -> List[st return input_data -def load_text_data(config: Dict, batch_size: int) -> List[str]: - path_to_data = config["data_folder"] - recursive_search = config["recursive_search"] +def load_text_data(config: PipelineBenchmarkConfig, batch_size: int) -> List[str]: + if not config.data_folder: + raise Exception("Data folder must be defined for real inputs") + path_to_data = config.data_folder + recursive_search = config.recursive_search input_files = get_files_with_endings( path_to_data, batch_size, recursive_search, (".txt") ) - if "max_string_length" in config: - max_string_length = config["max_string_length"] + if config.max_string_length: + max_string_length = config.max_string_length else: max_string_length = -1 _LOGGER.warning("Using default max string length %d" % max_string_length) @@ -153,9 +167,11 @@ def load_text_data(config: Dict, batch_size: int) -> List[str]: return input_data -def generate_question_data(config: Dict, avg_word_len=5) -> Tuple[str, str]: - if "gen_sequence_length" in config: - string_length = config["gen_sequence_length"] +def generate_question_data( + config: PipelineBenchmarkConfig, avg_word_len=5 +) -> Tuple[str, str]: + if config.gen_sequence_length: + string_length = config.gen_sequence_length else: string_length = DEFAULT_STRING_LENGTH _LOGGER.warning("Using default string length %d" % string_length) @@ -165,8 +181,12 @@ def generate_question_data(config: Dict, avg_word_len=5) -> Tuple[str, str]: def load_question_data(config: Dict) -> Tuple[str, str]: - path_to_questions = config["question_file"] - path_to_context = config["context_file"] + if not config.question_file or not config.context_file: + raise Exception( + "Question and context files must be defined for question_answering pieline" + ) + path_to_questions = config.question_file + path_to_context = config.context_file question = "" context = "" diff --git a/src/deepsparse/benchmark/helpers.py b/src/deepsparse/benchmark/helpers.py index 301e834480..ef6b6c92a5 100644 --- a/src/deepsparse/benchmark/helpers.py +++ b/src/deepsparse/benchmark/helpers.py @@ -17,7 +17,10 @@ import os from typing import Dict +from pydantic import ValidationError + from deepsparse import Scheduler +from deepsparse.benchmark.config import PipelineBenchmarkConfig _LOGGER = logging.getLogger(__name__) @@ -104,4 +107,7 @@ def parse_input_config(input_config_file: str) -> Dict[str, any]: config_file = open(input_config_file) config = json.load(config_file) config_file.close() - return config + try: + return PipelineBenchmarkConfig(**config) + except ValidationError as e: + _LOGGER.error(e) From 67d8187768d13eaf1212b7b71677baebfa458907 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Tue, 1 
Aug 2023 17:00:30 -0400 Subject: [PATCH 27/37] quality fix --- src/deepsparse/benchmark/data_creation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/deepsparse/benchmark/data_creation.py b/src/deepsparse/benchmark/data_creation.py index 8407c8cf0f..dcdbc9d7a5 100644 --- a/src/deepsparse/benchmark/data_creation.py +++ b/src/deepsparse/benchmark/data_creation.py @@ -108,7 +108,8 @@ def generate_image_data( else: image_shape = DEFAULT_IMAGE_SHAPE _LOGGER.warning( - f"Could not parse {config.input_image_shape}, Using default image shape {image_shape}" + f"Could not parse {config.input_image_shape}, " + "Using default image shape {image_shape}" ) for _ in range(batch_size): From 8b9768e06f78f8fd87d268e31cd9ca06eb1b6721 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Wed, 2 Aug 2023 10:12:32 -0400 Subject: [PATCH 28/37] fix broken test --- tests/test_pipeline_benchmark.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/test_pipeline_benchmark.py b/tests/test_pipeline_benchmark.py index 698e50c927..68a1dcc3b3 100644 --- a/tests/test_pipeline_benchmark.py +++ b/tests/test_pipeline_benchmark.py @@ -20,6 +20,7 @@ import pytest from deepsparse import Pipeline from deepsparse.benchmark.benchmark_pipeline import calculate_section_stats +from deepsparse.benchmark.config import PipelineBenchmarkConfig from deepsparse.benchmark.data_creation import ( SchemaType, generate_image_data, @@ -100,7 +101,8 @@ def test_pipeline_benchmark( def test_generate_image_data(): batch_size = 32 - config = {"input_image_shape": (600, 600, 1)} + config_args = {"input_image_shape": (600, 600, 1)} + config = PipelineBenchmarkConfig(**config_args) image_data = generate_image_data(config, batch_size) assert len(image_data) == batch_size img = image_data[0] @@ -112,7 +114,8 @@ def test_generate_image_data(): def test_generate_text_data(): batch_size = 16 avg_word_len = 8 - config = {"gen_sequence_length": 250} + config_args = {"gen_sequence_length": 250} + config = PipelineBenchmarkConfig(**config_args) text_data = generate_text_data(config, batch_size, avg_word_len=avg_word_len) assert len(text_data) == batch_size text = text_data[0] @@ -123,10 +126,11 @@ def test_generate_text_data(): def test_generate_question_data(): avg_word_len = 10 - config = {"gen_sequence_length": 50} + config_args = {"gen_sequence_length": 50} + config = PipelineBenchmarkConfig(**config_args) question, context = generate_question_data(config, avg_word_len=avg_word_len) - assert len(question) == config["gen_sequence_length"] - assert len(context) == config["gen_sequence_length"] + assert len(question) == config.gen_sequence_length + assert len(context) == config.gen_sequence_length num_q_spaces = question.count(" ") num_c_spaces = context.count(" ") assert num_q_spaces == num_c_spaces == int(len(question) / avg_word_len) From 50d5a74dc219d957786ca85521ed304a11d5b01c Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Wed, 2 Aug 2023 16:43:32 -0400 Subject: [PATCH 29/37] cleanup code, replace argpase with click --- .../benchmark/benchmark_pipeline.py | 309 +++++++++--------- src/deepsparse/benchmark/data_creation.py | 14 +- tests/test_pipeline_benchmark.py | 3 +- 3 files changed, 162 insertions(+), 164 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_pipeline.py b/src/deepsparse/benchmark/benchmark_pipeline.py index 748f7b2545..6a445f5764 100644 --- a/src/deepsparse/benchmark/benchmark_pipeline.py +++ b/src/deepsparse/benchmark/benchmark_pipeline.py @@ -81,7 +81,6 @@ -c 
config.json -t 30 -s async """ -import argparse import json import logging import queue @@ -89,6 +88,7 @@ import time from typing import Dict, List, Tuple +import click import numpy from pydantic import BaseModel @@ -125,121 +125,6 @@ ORT_ENGINE = "onnxruntime" -def parse_args(): - parser = argparse.ArgumentParser(description="Benchmark DeepSparse Pipelines") - parser.add_argument("task_name", type=str, help="Type of pipeline to run") - parser.add_argument( - "model_path", - type=str, - help="Path to an ONNX model file or SparseZoo model stub", - ) - parser.add_argument( - "-c", - "--input_config", - type=str, - default="config.json", - help="JSON file containing schema for input data", - ) - parser.add_argument( - "-b", - "--batch_size", - type=int, - default=1, - help="The batch size to run the analysis for. Must be greater than 0", - ) - parser.add_argument( - "-ncores", - "--num_cores", - type=int, - default=cpu_architecture().num_available_physical_cores, - help=( - "The number of physical cores to run the analysis on, " - "defaults to all physical cores available on the system" - ), - ) - parser.add_argument( - "-s", - "--scenario", - type=str, - default="sync", - choices=["async", "sync", "elastic"], - help=( - "Choose between using the async, sync and elastic scenarios. Sync and " - "async are similar to the single-stream/multi-stream scenarios. Elastic " - "is a newer scenario that behaves similarly to the async scenario " - "but uses a different scheduling backend. Default value is sync." - ), - ) - parser.add_argument( - "-t", - "--time", - type=int, - default=10, - help="The number of seconds the benchmark will run. Default is 10 seconds.", - ) - parser.add_argument( - "-w", - "--warmup_time", - type=int, - default=2, - help=( - "The number of seconds the benchmark will warmup before running." - "Default is 2 seconds." - ), - ) - parser.add_argument( - "-nstreams", - "--num_streams", - type=int, - default=None, - help=( - "The number of streams that will submit inferences in parallel using " - "async scenario. Default is automatically determined for given hardware " - "and may be sub-optimal." - ), - ) - parser.add_argument( - "-pin", - "--thread_pinning", - type=str, - default="core", - choices=["none", "core", "numa"], - help=( - "Enable binding threads to cores ('core' the default), " - "threads to cores on sockets ('numa'), or disable ('none')" - ), - ) - parser.add_argument( - "-e", - "--engine", - type=str, - default=DEEPSPARSE_ENGINE, - help=( - "Inference engine backend to run eval on. Choices are 'deepsparse', " - "'onnxruntime'. Default is 'deepsparse'. Can also specify a user " - "defined engine class by giving the script and class name in the " - "following format :. 
This " - "engine class will be dynamically imported during runtime" - ), - ) - parser.add_argument( - "-q", - "--quiet", - help="Lower logging verbosity", - action="store_true", - default=False, - ) - parser.add_argument( - "-x", - "--export_path", - help="Store results into a JSON file", - type=str, - default=None, - ) - - return parser.parse_args() - - class PipelineExecutorThread(threading.Thread): """ Run pipeline reoeatedly on inputs for max_time seconds, pushing the timer data to @@ -329,11 +214,7 @@ def create_input_schema( input_data = generate_text_data(config, batch_size) inputs = pipeline.input_schema(inputs=input_data, **kwargs) elif input_schema_requirement == SchemaType.QUESTION: - if batch_size != 1: - _LOGGER.warning( - "Only batch size of 1 supported for Question Answering Pipeline" - ) - question, context = generate_question_data(config) + question, context = generate_question_data(config, batch_size) inputs = pipeline.input_schema(question=question, context=context, **kwargs) elif input_type == PipelineInputType.REAL: if input_schema_requirement == SchemaType.IMAGE: @@ -346,11 +227,7 @@ def create_input_schema( input_data = load_text_data(config, batch_size) inputs = pipeline.input_schema(inputs=input_data, **kwargs) elif input_schema_requirement == SchemaType.QUESTION: - if batch_size != 1: - _LOGGER.warning( - "Only batch size of 1 supported for Question Answering Pipeline" - ) - question, context = load_question_data(config) + question, context = load_question_data(config, batch_size) inputs = pipeline.input_schema(question=question, context=context, **kwargs) else: raise Exception(f"Unknown input type '{input_type}'") @@ -462,33 +339,149 @@ def calculate_section_stats( return sections -def main(): - args = parse_args() - config = parse_input_config(args.input_config) - - _LOGGER.info("Original Model Path: %s" % args.model_path) - _LOGGER.info("Task: %s" % args.task_name) - _LOGGER.info("Batch Size: %d" % args.batch_size) - _LOGGER.info("Scenario: %s" % args.scenario) - _LOGGER.info("Requested Run Time(sec): %d" % args.time) +@click.command() +@click.argument("task_name", type=str) +@click.argument("model_path", type=str) +@click.option( + "-c", + "--input_config", + type=str, + default="config.json", + help="JSON file containing schema for input data", +) +@click.option( + "-b", + "--batch_size", + type=int, + default=1, + help="The batch size to run the analysis for. Must be greater than 0", +) +@click.option( + "-ncores", + "--num_cores", + type=int, + default=cpu_architecture().num_available_physical_cores, + help=( + "The number of physical cores to run the analysis on, " + "defaults to all physical cores available on the system" + ), +) +@click.option( + "-s", + "--scenario", + type=str, + default="sync", + help=( + "Choose between using the async, sync and elastic scenarios. Sync and " + "async are similar to the single-stream/multi-stream scenarios. Elastic " + "is a newer scenario that behaves similarly to the async scenario " + "but uses a different scheduling backend. Default value is sync." + ), +) +@click.option( + "-t", + "--run_time", + type=int, + default=10, + help="The number of seconds the benchmark will run. Default is 10 seconds.", +) +@click.option( + "-w", + "--warmup_time", + type=int, + default=2, + help=( + "The number of seconds the benchmark will warmup before running." + "Default is 2 seconds." 
+ ), +) +@click.option( + "-nstreams", + "--num_streams", + type=int, + default=None, + help=( + "The number of streams that will submit inferences in parallel using " + "async scenario. Default is automatically determined for given hardware " + "and may be sub-optimal." + ), +) +@click.option( + "-pin", + "--thread_pinning", + type=str, + default="core", + help=( + "Enable binding threads to cores ('core' the default), " + "threads to cores on sockets ('numa'), or disable ('none')" + ), +) +@click.option( + "-e", + "--engine", + type=str, + default=DEEPSPARSE_ENGINE, + help=( + "Inference engine backend to run eval on. Choices are 'deepsparse', " + "'onnxruntime'. Default is 'deepsparse'. Can also specify a user " + "defined engine class by giving the script and class name in the " + "following format :. This " + "engine class will be dynamically imported during runtime" + ), +) +@click.option( + "-q", + "--quiet", + help="Lower logging verbosity", + default=False, +) +@click.option( + "-x", + "--export_path", + help="Store results into a JSON file", + type=str, + default=None, +) +def main( + task_name: str, + model_path: str, + input_config: str, + batch_size: int, + num_cores: int, + scenario: str, + run_time: int, + warmup_time: int, + num_streams: int, + thread_pinning: str, + engine: str, + quiet: bool, + export_path: str, +): + config = parse_input_config(input_config) + + _LOGGER.info("Original Model Path: %s" % model_path) + _LOGGER.info("Task: %s" % task_name) + _LOGGER.info("Batch Size: %d" % batch_size) + _LOGGER.info("Scenario: %s" % scenario) + _LOGGER.info("Requested Run Time(sec): %d" % run_time) batch_times, total_run_time, num_streams = benchmark_pipeline( - model_path=args.model_path, - task=args.task_name, + model_path=model_path, + task=task_name, config=config, - batch_size=args.batch_size, - num_cores=args.num_cores, - scenario=args.scenario, - seconds_to_run=args.time, - warmup_time=args.warmup_time, - num_streams=args.num_streams, - thread_pinning=args.thread_pinning, - engine=args.engine, - quiet=args.quiet, + batch_size=batch_size, + num_cores=num_cores, + scenario=scenario, + seconds_to_run=run_time, + warmup_time=warmup_time, + num_streams=num_streams, + thread_pinning=thread_pinning, + engine=engine, + quiet=quiet, ) section_stats = calculate_section_stats(batch_times, total_run_time, num_streams) - items_per_sec = (len(batch_times) * args.batch_size) / total_run_time + items_per_sec = (len(batch_times) * batch_size) / total_run_time benchmark_results = { "items_per_sec": items_per_sec, @@ -498,29 +491,29 @@ def main(): } export_dict = { - "engine": args.engine, + "engine": engine, "version": __version__, - "model_path": args.model_path, - "batch_size": args.batch_size, - "num_cores": args.num_cores, - "scenario": args.scenario, - "seconds_to_run": time, - "num_streams": args.num_streams, + "model_path": model_path, + "batch_size": batch_size, + "num_cores": num_cores, + "scenario": scenario, + "seconds_to_run": run_time, + "num_streams": num_streams, "input_config": dict(config), "benchmark_results": benchmark_results, } # Export results - export_path = args.export_path + export_path = export_path if export_path: _LOGGER.info("Saving benchmark results to JSON file at %s" % export_path) with open(export_path, "w") as out: json.dump(export_dict, out, indent=2) # Results summary - print("Original Model Path: %s" % args.model_path) - print("Batch Size: %d" % args.batch_size) - print("Scenario: %s" % args.scenario) + print("Original Model Path: %s" % model_path) 
+ print("Batch Size: %d" % batch_size) + print("Scenario: %s" % scenario) print("Iterations: %d" % int(benchmark_results["iterations"])) print("Total Runtime: %.4f" % total_run_time) print("Throughput (items/sec): %.4f" % benchmark_results["items_per_sec"]) diff --git a/src/deepsparse/benchmark/data_creation.py b/src/deepsparse/benchmark/data_creation.py index dcdbc9d7a5..0f0849280f 100644 --- a/src/deepsparse/benchmark/data_creation.py +++ b/src/deepsparse/benchmark/data_creation.py @@ -32,8 +32,6 @@ __all__ = [ "get_input_schema_type", - "get_files_with_endings", - "generate_sentence", "generate_image_data", "load_image_data", "generate_text_data", @@ -169,8 +167,12 @@ def load_text_data(config: PipelineBenchmarkConfig, batch_size: int) -> List[str def generate_question_data( - config: PipelineBenchmarkConfig, avg_word_len=5 + config: PipelineBenchmarkConfig, batch_size: int, avg_word_len=5 ) -> Tuple[str, str]: + if batch_size != 1: + _LOGGER.warning( + "Only batch size of 1 supported for Question Answering Pipeline" + ) if config.gen_sequence_length: string_length = config.gen_sequence_length else: @@ -181,7 +183,11 @@ def generate_question_data( return (question, context) -def load_question_data(config: Dict) -> Tuple[str, str]: +def load_question_data(config: Dict, batch_size: int) -> Tuple[str, str]: + if batch_size != 1: + _LOGGER.warning( + "Only batch size of 1 supported for Question Answering Pipeline" + ) if not config.question_file or not config.context_file: raise Exception( "Question and context files must be defined for question_answering pieline" diff --git a/tests/test_pipeline_benchmark.py b/tests/test_pipeline_benchmark.py index 68a1dcc3b3..64cf717765 100644 --- a/tests/test_pipeline_benchmark.py +++ b/tests/test_pipeline_benchmark.py @@ -128,7 +128,7 @@ def test_generate_question_data(): avg_word_len = 10 config_args = {"gen_sequence_length": 50} config = PipelineBenchmarkConfig(**config_args) - question, context = generate_question_data(config, avg_word_len=avg_word_len) + question, context = generate_question_data(config, 1, avg_word_len=avg_word_len) assert len(question) == config.gen_sequence_length assert len(context) == config.gen_sequence_length num_q_spaces = question.count(" ") @@ -145,7 +145,6 @@ def test_generate_question_data(): ("question_answering", SchemaType.QUESTION), ], ) -@pytest.mark.skip(reason="High memory usage, causes GitHub test run to be killed") def test_get_input_schema_type(task_name, input_schema): pipeline = Pipeline.create(task=task_name) assert get_input_schema_type(pipeline) == input_schema From 70f74406dbebf37eb36c4126b1c414aced4b4674 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Fri, 4 Aug 2023 17:53:42 -0400 Subject: [PATCH 30/37] Update README with example output --- src/deepsparse/benchmark/README.md | 34 ++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/deepsparse/benchmark/README.md b/src/deepsparse/benchmark/README.md index 7912abe18c..0088207347 100644 --- a/src/deepsparse/benchmark/README.md +++ b/src/deepsparse/benchmark/README.md @@ -266,4 +266,38 @@ deepsparse.benchmark_pipeline image_classification zoo:cv/classification/resnet_ Running CodeGen text generation for 30 seconds asynchronously ``` deepsparse.benchmark_pipeline text_generation zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/pruned50-none -c config.json -t 30 -s async +``` +### Example Output +Command: +``` +deepsparse.benchmark_pipeline text_classification 
zoo:nlp/sentiment_analysis/distilbert-none/pytorch/huggingface/sst2/pruned90-none -c config.json +``` +config.json: +```json +{ + "data_type": "real", + "gen_sequence_length": 1000, + "data_folder": "/home/sadkins/text_data/", + "recursive_search": true, + "max_string_length": -1 +} +``` + +Output: +``` +Batch Size: 1 +Scenario: sync +Iterations: 955 +Total Runtime: 10.0090 +Throughput (items/sec): 95.4137 +Processing Time Breakdown: + total_inference: 99.49% + pre_process: 25.70% + engine_forward: 72.56% + post_process: 1.03% +Mean Latency Breakdown (ms/batch): + total_inference: 10.4274 + pre_process: 2.6938 + engine_forward: 7.6051 + post_process: 0.1077 ``` \ No newline at end of file From 3afeec744b3f8993c9361ba9458180bbf7c2c004 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Wed, 9 Aug 2023 16:02:24 -0400 Subject: [PATCH 31/37] support for multiple timers, adding docstrings --- .../benchmark/benchmark_pipeline.py | 46 +++++++++++++++---- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_pipeline.py b/src/deepsparse/benchmark/benchmark_pipeline.py index 6a445f5764..2dbb4898a0 100644 --- a/src/deepsparse/benchmark/benchmark_pipeline.py +++ b/src/deepsparse/benchmark/benchmark_pipeline.py @@ -149,7 +149,8 @@ def __init__( def run(self): while time.perf_counter() < self._max_time: _ = self._pipeline(self._inputs) - self._time_queue.put(self._pipeline.timer_manager.latest) + for timer in self._pipeline.timer_manager.timers: + self._time_queue.put(timer) def singlestream_benchmark( @@ -163,7 +164,8 @@ def singlestream_benchmark( batch_timings = [] while time.perf_counter() < benchmark_end_time: _ = pipeline(inputs) - batch_timings.append(pipeline.timer_manager.latest) + for timer in pipeline.timer_manager.timers: + batch_timings.append(timer) return batch_timings @@ -249,6 +251,25 @@ def benchmark_pipeline( engine: str = DEEPSPARSE_ENGINE, quiet: bool = False, ) -> Tuple[List[StagedTimer], float]: + """ + Run a benchmark over the specified pipeline, tracking timings for pre-processing, + forward pass and post-processing. Results are printed to the console and optionally + exported to a json file. 
+ + :param model_path: path to onnx model + :param task: name of pipeline to run + :param config: configuration for pipeline inputs + :param batch_size: number of inputs to process each forward pass + :param num_cores: number of physical cores to run on + :param scenario: sync, async or elastic processing + :param seconds_to_run: number of seconds to run benchmark for + :param warmup_time: length to run pipeline before beginning benchmark + :param num_streams: number of parallel streams during async scenario + :param thread_pinning: enable binding threads to cores + :param engine: inference engine, deepsparse or onnxruntime + :param quiet: lower logging verbosity + :return: list of StagedTimer objects for each forward pass and the total run time + """ if quiet: set_logging_level(logging.WARN) @@ -326,14 +347,20 @@ def calculate_statistics( def calculate_section_stats( batch_times: List[StagedTimer], total_run_time: float, num_streams: int ) -> Dict[str, Dict]: - compute_sections = batch_times[0].stages total_run_time_ms = total_run_time * 1000 + section_times = {} + for timer in batch_times: + for section in timer.stages: + if section not in section_times: + section_times[section] = [] + section_times[section].append(timer.times[section] * 1000) + sections = {} - for section in compute_sections: - section_times = [st.times[section] * 1000 for st in batch_times] - sections[section] = calculate_statistics( - section_times, total_run_time_ms, num_streams + for section_name in section_times: + times = section_times[section_name] + sections[section_name] = calculate_statistics( + times, total_run_time_ms, num_streams ) return sections @@ -519,12 +546,11 @@ def main( print("Throughput (items/sec): %.4f" % benchmark_results["items_per_sec"]) print("Processing Time Breakdown: ") - compute_sections = batch_times[0].stages - for section in compute_sections: + for section in section_stats: print(" %s: %.2f%%" % (section, section_stats[section]["total_percentage"])) print("Mean Latency Breakdown (ms/batch): ") - for section in compute_sections: + for section in section_stats: print(" %s: %.4f" % (section, section_stats[section]["mean"])) From df9a3f7bd752c48106ac5dff2ab22085e819fe86 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Wed, 9 Aug 2023 16:05:05 -0400 Subject: [PATCH 32/37] docstrings --- src/deepsparse/benchmark/benchmark_pipeline.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/deepsparse/benchmark/benchmark_pipeline.py b/src/deepsparse/benchmark/benchmark_pipeline.py index 2dbb4898a0..e99c513bf1 100644 --- a/src/deepsparse/benchmark/benchmark_pipeline.py +++ b/src/deepsparse/benchmark/benchmark_pipeline.py @@ -159,6 +159,11 @@ def singlestream_benchmark( """ Run pipeline repeatedly on inputs for max_time seconds, storing the runtime of each section of the pipeline in batch_timings + + :param pipeline: pipeline to execute + :param inputs: inputs to pass through pipeline + :param seconds_to_run: how long to run pipeline for + :return: list of timings for each forward pass """ benchmark_end_time = time.perf_counter() + seconds_to_run batch_timings = [] From b0bc84066cc6148bf099cead201f72defa1c0c22 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Thu, 10 Aug 2023 11:14:56 -0400 Subject: [PATCH 33/37] add text generation example to README --- src/deepsparse/benchmark/README.md | 41 ++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/deepsparse/benchmark/README.md b/src/deepsparse/benchmark/README.md index 0088207347..5c23ff5c14 100644 --- 
a/src/deepsparse/benchmark/README.md +++ b/src/deepsparse/benchmark/README.md @@ -300,4 +300,45 @@ Mean Latency Breakdown (ms/batch): pre_process: 2.6938 engine_forward: 7.6051 post_process: 0.1077 +``` + +Command: +``` +deepsparse.benchmark_pipeline text_generation zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base_quant-none -c config.json -t 60 +``` +config.json: +```json +{ + "data_type": "dummy", + "gen_sequence_length": 100, + "pipeline_kwargs": {}, + "input_schema_kwargs": {} +} +``` + +Output: +``` +Batch Size: 1 +Scenario: sync +Iterations: 6 +Total Runtime: 62.8005 +Throughput (items/sec): 0.0955 +Processing Time Breakdown: + total_inference: 100.00% + pre_process: 0.00% + engine_forward: 99.98% + post_process: 0.01% + engine_prompt_prefill: 5.83% + engine_prompt_prefill_single: 0.09% + engine_token_generation: 93.64% + engine_token_generation_single: 0.09% +Mean Latency Breakdown (ms/batch): + total_inference: 20932.4786 + pre_process: 0.9729 + engine_forward: 20930.2190 + post_process: 1.2150 + engine_prompt_prefill: 1220.7037 + engine_prompt_prefill_single: 19.0412 + engine_token_generation: 19603.0353 + engine_token_generation_single: 19.1170 ``` \ No newline at end of file From eba70d6d56dbc00e4b42c841e520c75000194889 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Thu, 10 Aug 2023 14:50:00 -0400 Subject: [PATCH 34/37] clean up timermanager usage --- .../benchmark/benchmark_pipeline.py | 68 +++++++------------ src/deepsparse/utils/timer.py | 4 ++ tests/test_pipeline_benchmark.py | 7 +- 3 files changed, 35 insertions(+), 44 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_pipeline.py b/src/deepsparse/benchmark/benchmark_pipeline.py index e99c513bf1..697d24b947 100644 --- a/src/deepsparse/benchmark/benchmark_pipeline.py +++ b/src/deepsparse/benchmark/benchmark_pipeline.py @@ -83,7 +83,6 @@ import json import logging -import queue import threading import time from typing import Dict, List, Tuple @@ -113,7 +112,6 @@ ) from deepsparse.cpu import cpu_architecture from deepsparse.log import set_logging_level -from deepsparse.utils.timer import StagedTimer __all__ = ["benchmark_pipeline"] @@ -127,8 +125,8 @@ class PipelineExecutorThread(threading.Thread): """ - Run pipeline reoeatedly on inputs for max_time seconds, pushing the timer data to - the timer queue to store the runtime of each section of the pipeline. 
+ Run pipeline reoeatedly on inputs for max_time seconds, storing the runtime of each + section of the pipeline in its timer manager For intended usage, see multistream_benchmark """ @@ -137,42 +135,32 @@ def __init__( self, pipeline: Pipeline, inputs: List[any], - time_queue: queue.Queue, max_time: float, ): super(PipelineExecutorThread, self).__init__() self._pipeline = pipeline self._inputs = inputs - self._time_queue = time_queue self._max_time = max_time def run(self): while time.perf_counter() < self._max_time: _ = self._pipeline(self._inputs) - for timer in self._pipeline.timer_manager.timers: - self._time_queue.put(timer) def singlestream_benchmark( pipeline: Pipeline, inputs: List[any], seconds_to_run: float -) -> List[StagedTimer]: +): """ Run pipeline repeatedly on inputs for max_time seconds, storing the runtime of each - section of the pipeline in batch_timings + section of the pipeline in its timer manager :param pipeline: pipeline to execute :param inputs: inputs to pass through pipeline :param seconds_to_run: how long to run pipeline for - :return: list of timings for each forward pass """ benchmark_end_time = time.perf_counter() + seconds_to_run - batch_timings = [] while time.perf_counter() < benchmark_end_time: _ = pipeline(inputs) - for timer in pipeline.timer_manager.timers: - batch_timings.append(timer) - - return batch_timings def multistream_benchmark( @@ -180,17 +168,21 @@ def multistream_benchmark( inputs: List[any], seconds_to_run: float, num_streams: int, -) -> List[StagedTimer]: +): """ Create num_streams threads, each of which calls PipelineExecutorThread.run() for - seconds_to_run seconds. Stores all timing info in a shared queue. + seconds_to_run seconds. All timing info stored in pipeline.timer_manager + + :param pipeline: pipeline to execute + :param inputs: inputs to pass through pipeline + :param seconds_to_run: how long to run pipeline for + :param num_streams: number of threads to launch """ - time_queue = queue.Queue() max_time = time.perf_counter() + seconds_to_run threads = [] for thread in range(num_streams): - threads.append(PipelineExecutorThread(pipeline, inputs, time_queue, max_time)) + threads.append(PipelineExecutorThread(pipeline, inputs, max_time)) for thread in threads: thread.start() # triggers PipelineExecutorThread.run() @@ -198,8 +190,6 @@ def multistream_benchmark( for thread in threads: thread.join() - return list(time_queue.queue) - def create_input_schema( pipeline: Pipeline, @@ -255,7 +245,7 @@ def benchmark_pipeline( thread_pinning: str = "core", engine: str = DEEPSPARSE_ENGINE, quiet: bool = False, -) -> Tuple[List[StagedTimer], float]: +) -> Tuple[Dict[str, List[float]], float]: """ Run a benchmark over the specified pipeline, tracking timings for pre-processing, forward pass and post-processing. 
Results are printed to the console and optionally @@ -273,7 +263,7 @@ def benchmark_pipeline( :param thread_pinning: enable binding threads to cores :param engine: inference engine, deepsparse or onnxruntime :param quiet: lower logging verbosity - :return: list of StagedTimer objects for each forward pass and the total run time + :return: dictionary of section times for each forward pass and the total run time """ if quiet: @@ -289,6 +279,7 @@ def benchmark_pipeline( input_type = config.data_type kwargs = config.pipeline_kwargs + kwargs["benchmark"] = True pipeline = Pipeline.create( task=task, model_path=model_path, @@ -302,25 +293,25 @@ def benchmark_pipeline( if scenario == "singlestream": singlestream_benchmark(pipeline, inputs, warmup_time) + pipeline.timer_manager.clear() start_time = time.perf_counter() - batch_times = singlestream_benchmark(pipeline, inputs, seconds_to_run) + singlestream_benchmark(pipeline, inputs, seconds_to_run) elif scenario == "multistream": multistream_benchmark(pipeline, inputs, warmup_time, num_streams) + pipeline.timer_manager.clear() start_time = time.perf_counter() - batch_times = multistream_benchmark( - pipeline, inputs, seconds_to_run, num_streams - ) + multistream_benchmark(pipeline, inputs, seconds_to_run, num_streams) elif scenario == "elastic": multistream_benchmark(pipeline, inputs, warmup_time, num_streams) + pipeline.timer_manager.clear() start_time = time.perf_counter() - batch_times = multistream_benchmark( - pipeline, inputs, seconds_to_run, num_streams - ) + multistream_benchmark(pipeline, inputs, seconds_to_run, num_streams) else: raise Exception(f"Unknown scenario '{scenario}'") end_time = time.perf_counter() total_run_time = end_time - start_time + batch_times = pipeline.timer_manager.all_times if len(batch_times) == 0: raise Exception( "Generated no batch timings, try extending benchmark time with '--time'" @@ -350,20 +341,13 @@ def calculate_statistics( def calculate_section_stats( - batch_times: List[StagedTimer], total_run_time: float, num_streams: int + batch_times: Dict[str, List[float]], total_run_time: float, num_streams: int ) -> Dict[str, Dict]: total_run_time_ms = total_run_time * 1000 - section_times = {} - for timer in batch_times: - for section in timer.stages: - if section not in section_times: - section_times[section] = [] - section_times[section].append(timer.times[section] * 1000) - sections = {} - for section_name in section_times: - times = section_times[section_name] + for section_name in batch_times: + times = [t * 1000 for t in batch_times[section_name]] sections[section_name] = calculate_statistics( times, total_run_time_ms, num_streams ) @@ -513,7 +497,7 @@ def main( ) section_stats = calculate_section_stats(batch_times, total_run_time, num_streams) - items_per_sec = (len(batch_times) * batch_size) / total_run_time + items_per_sec = (len(batch_times["total_inference"]) * batch_size) / total_run_time benchmark_results = { "items_per_sec": items_per_sec, diff --git a/src/deepsparse/utils/timer.py b/src/deepsparse/utils/timer.py index 1dcaf77acf..56a3452b6e 100644 --- a/src/deepsparse/utils/timer.py +++ b/src/deepsparse/utils/timer.py @@ -338,6 +338,10 @@ def all_times(self) -> Dict[str, List[float]]: return all_times + def clear(self): + for t in self._timers: + t.clear() + @contextmanager def new_timer_context(self, total_inference: bool = True) -> StagedTimer: """ diff --git a/tests/test_pipeline_benchmark.py b/tests/test_pipeline_benchmark.py index 64cf717765..d083beed5a 100644 --- 
a/tests/test_pipeline_benchmark.py +++ b/tests/test_pipeline_benchmark.py @@ -28,7 +28,7 @@ generate_text_data, get_input_schema_type, ) -from deepsparse.utils import StagedTimer +from deepsparse.utils import StagedTimer, TimerManager from tests.helpers import run_command @@ -97,6 +97,7 @@ def test_pipeline_benchmark( assert res.returncode == 0 assert "error" not in res.stdout.lower() assert "fail" not in res.stdout.lower() + assert "total_inference" in res.stdout.lower() def test_generate_image_data(): @@ -152,6 +153,7 @@ def test_get_input_schema_type(task_name, input_schema): def test_calculations(): batch_times = [] + timer_manager = TimerManager() for i in range(5): timer = StagedTimer() timer._staged_start_times["stage_1"] = [i + 0.1] @@ -160,8 +162,9 @@ def test_calculations(): timer._staged_start_times["stage_2"] = [i + 0.6] timer._staged_stop_times["stage_2"] = [i + 0.9] - batch_times.append(timer) + timer_manager._timers.append(timer) + batch_times = timer_manager.all_times total_run_time = 6.0 section_stats = calculate_section_stats(batch_times, total_run_time, 1) assert math.isclose( From 1eb3202e0afa16fdde10e6449147da1fc6f89e4e Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Tue, 15 Aug 2023 12:41:55 -0400 Subject: [PATCH 35/37] PR comments --- .../benchmark/benchmark_pipeline.py | 14 ++++----- src/deepsparse/benchmark/data_creation.py | 29 ++++++++----------- src/deepsparse/benchmark/helpers.py | 29 +++++++++++++++---- tests/test_pipeline_benchmark.py | 18 ++++++------ 4 files changed, 52 insertions(+), 38 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_pipeline.py b/src/deepsparse/benchmark/benchmark_pipeline.py index 697d24b947..88283a7dec 100644 --- a/src/deepsparse/benchmark/benchmark_pipeline.py +++ b/src/deepsparse/benchmark/benchmark_pipeline.py @@ -95,9 +95,9 @@ from deepsparse.benchmark.config import PipelineBenchmarkConfig, PipelineInputType from deepsparse.benchmark.data_creation import ( SchemaType, - generate_image_data, - generate_question_data, - generate_text_data, + generate_random_image_data, + generate_random_question_data, + generate_random_text_data, get_input_schema_type, load_image_data, load_question_data, @@ -202,16 +202,16 @@ def create_input_schema( if input_type == PipelineInputType.DUMMY: if input_schema_requirement == SchemaType.IMAGE: - input_data = generate_image_data(config, batch_size) + input_data = generate_random_image_data(config, batch_size) inputs = pipeline.input_schema(images=input_data, **kwargs) elif input_schema_requirement == SchemaType.TEXT_SEQ: - input_data = generate_text_data(config, batch_size) + input_data = generate_random_text_data(config, batch_size) inputs = pipeline.input_schema(sequences=input_data, **kwargs) elif input_schema_requirement == SchemaType.TEXT_INPUT: - input_data = generate_text_data(config, batch_size) + input_data = generate_random_text_data(config, batch_size) inputs = pipeline.input_schema(inputs=input_data, **kwargs) elif input_schema_requirement == SchemaType.QUESTION: - question, context = generate_question_data(config, batch_size) + question, context = generate_random_question_data(config, batch_size) inputs = pipeline.input_schema(question=question, context=context, **kwargs) elif input_type == PipelineInputType.REAL: if input_schema_requirement == SchemaType.IMAGE: diff --git a/src/deepsparse/benchmark/data_creation.py b/src/deepsparse/benchmark/data_creation.py index 0f0849280f..7c6c378b1b 100644 --- a/src/deepsparse/benchmark/data_creation.py +++ 
b/src/deepsparse/benchmark/data_creation.py @@ -32,11 +32,11 @@ __all__ = [ "get_input_schema_type", - "generate_image_data", + "generate_random_image_data", "load_image_data", - "generate_text_data", + "generate_random_text_data", "load_text_data", - "generate_question_data", + "generate_random_question_data", "load_question_data", ] @@ -86,7 +86,7 @@ def get_files_with_endings( return random.sample(files, num_files) -def generate_sentence(string_length: int, avg_word_length: int = 5): +def generate_random_sentence(string_length: int, avg_word_length: int = 5): random_chars = "".join(random.choices(string.ascii_letters, k=string_length)) space_locations = random.sample( range(string_length), int(string_length / avg_word_length) @@ -97,7 +97,7 @@ def generate_sentence(string_length: int, avg_word_length: int = 5): return "".join(random_chars) -def generate_image_data( +def generate_random_image_data( config: PipelineBenchmarkConfig, batch_size: int ) -> List[numpy.ndarray]: input_data = [] @@ -107,15 +107,10 @@ def generate_image_data( image_shape = DEFAULT_IMAGE_SHAPE _LOGGER.warning( f"Could not parse {config.input_image_shape}, " - "Using default image shape {image_shape}" + f"Using default image shape {image_shape}" ) - for _ in range(batch_size): - rand_array = numpy.random.randint(0, high=255, size=image_shape).astype( - numpy.uint8 - ) - input_data.append(rand_array) - + input_data = [numpy.random.randint(0, high=255, size=image_shape).astype(numpy.uint8) for _ in range(batch_size)] return input_data @@ -129,7 +124,7 @@ def load_image_data(config: PipelineBenchmarkConfig, batch_size: int) -> List[st ) -def generate_text_data( +def generate_random_text_data( config: PipelineBenchmarkConfig, batch_size: int, avg_word_len=5 ) -> List[str]: if config.gen_sequence_length: @@ -139,7 +134,7 @@ def generate_text_data( _LOGGER.warning("Ssing default string length %d" % string_length) input_data = [ - generate_sentence(string_length, avg_word_length=avg_word_len) + generate_random_sentence(string_length, avg_word_length=avg_word_len) for _ in range(batch_size) ] return input_data @@ -166,7 +161,7 @@ def load_text_data(config: PipelineBenchmarkConfig, batch_size: int) -> List[str return input_data -def generate_question_data( +def generate_random_question_data( config: PipelineBenchmarkConfig, batch_size: int, avg_word_len=5 ) -> Tuple[str, str]: if batch_size != 1: @@ -178,8 +173,8 @@ def generate_question_data( else: string_length = DEFAULT_STRING_LENGTH _LOGGER.warning("Using default string length %d" % string_length) - question = generate_sentence(string_length, avg_word_length=avg_word_len) - context = generate_sentence(string_length, avg_word_length=avg_word_len) + question = generate_random_sentence(string_length, avg_word_length=avg_word_len) + context = generate_random_sentence(string_length, avg_word_length=avg_word_len) return (question, context) diff --git a/src/deepsparse/benchmark/helpers.py b/src/deepsparse/benchmark/helpers.py index ef6b6c92a5..baa36b47ba 100644 --- a/src/deepsparse/benchmark/helpers.py +++ b/src/deepsparse/benchmark/helpers.py @@ -36,17 +36,29 @@ DEFAULT_STRING_LENGTH = 50 DEFAULT_IMAGE_SHAPE = (240, 240, 3) +class ThreadPinningMode: + CORE: str = "core" + NUMA: str = "numa" + NONE: str = "none" + def decide_thread_pinning(pinning_mode: str) -> None: + """ + Enable binding threads to cores ('core' the default), threads to cores on sockets + ('numa'), or disable ('none')" + + :param pinning_mode: thread pinning mode to use + :return: None + """ pinning_mode = 
pinning_mode.lower() - if pinning_mode in "core": + if pinning_mode == ThreadPinningMode.CORE: os.environ["NM_BIND_THREADS_TO_CORES"] = "1" _LOGGER.info("Thread pinning to cores enabled") - elif pinning_mode in "numa": + elif pinning_mode == ThreadPinningMode.NUMA: os.environ["NM_BIND_THREADS_TO_CORES"] = "0" os.environ["NM_BIND_THREADS_TO_SOCKETS"] = "1" _LOGGER.info("Thread pinning to socket/numa nodes enabled") - elif pinning_mode in "none": + elif pinning_mode in ThreadPinningMode.NONE: os.environ["NM_BIND_THREADS_TO_CORES"] = "0" os.environ["NM_BIND_THREADS_TO_SOCKETS"] = "0" _LOGGER.info("Thread pinning disabled, performance may be sub-optimal") @@ -57,6 +69,12 @@ def decide_thread_pinning(pinning_mode: str) -> None: def parse_scheduler(scenario: str) -> Scheduler: + """ + Returns a threading scheduler based on desired scenario + + :param scenario: scheduling scenario to use + :return: scehduler with desred scenario + """ scenario = scenario.lower() if scenario == "multistream": return Scheduler.multi_stream @@ -77,7 +95,7 @@ def parse_scenario(scenario: str) -> str: elif scenario == "elastic": return "elastic" else: - _LOGGER.info( + _LOGGER.warning( "Recieved invalid option for scenario'%s', defaulting to async" % scenario ) return "multistream" @@ -96,7 +114,7 @@ def parse_num_streams(num_streams: int, num_cores: int, scenario: str): return num_streams else: default_num_streams = max(1, int(num_cores / 2)) - _LOGGER.info( + _LOGGER.warning( "num_streams default value chosen of %d. " "This requires tuning and may be sub-optimal" % default_num_streams ) @@ -111,3 +129,4 @@ def parse_input_config(input_config_file: str) -> Dict[str, any]: return PipelineBenchmarkConfig(**config) except ValidationError as e: _LOGGER.error(e) + raise e diff --git a/tests/test_pipeline_benchmark.py b/tests/test_pipeline_benchmark.py index d083beed5a..ad685e9c77 100644 --- a/tests/test_pipeline_benchmark.py +++ b/tests/test_pipeline_benchmark.py @@ -23,9 +23,9 @@ from deepsparse.benchmark.config import PipelineBenchmarkConfig from deepsparse.benchmark.data_creation import ( SchemaType, - generate_image_data, - generate_question_data, - generate_text_data, + generate_random_image_data, + generate_random_question_data, + generate_random_text_data, get_input_schema_type, ) from deepsparse.utils import StagedTimer, TimerManager @@ -100,11 +100,11 @@ def test_pipeline_benchmark( assert "total_inference" in res.stdout.lower() -def test_generate_image_data(): +def test_generate_random_image_data(): batch_size = 32 config_args = {"input_image_shape": (600, 600, 1)} config = PipelineBenchmarkConfig(**config_args) - image_data = generate_image_data(config, batch_size) + image_data = generate_random_image_data(config, batch_size) assert len(image_data) == batch_size img = image_data[0] assert img.shape == (600, 600, 1) @@ -112,12 +112,12 @@ def test_generate_image_data(): assert numpy.max(img) < 255 and numpy.min(img) >= 0 -def test_generate_text_data(): +def test_generate_random_text_data(): batch_size = 16 avg_word_len = 8 config_args = {"gen_sequence_length": 250} config = PipelineBenchmarkConfig(**config_args) - text_data = generate_text_data(config, batch_size, avg_word_len=avg_word_len) + text_data = generate_random_text_data(config, batch_size, avg_word_len=avg_word_len) assert len(text_data) == batch_size text = text_data[0] assert len(text) == 250 @@ -125,11 +125,11 @@ def test_generate_text_data(): assert num_spaces == int(len(text) / avg_word_len) -def test_generate_question_data(): +def 
test_generate_random_question_data(): avg_word_len = 10 config_args = {"gen_sequence_length": 50} config = PipelineBenchmarkConfig(**config_args) - question, context = generate_question_data(config, 1, avg_word_len=avg_word_len) + question, context = generate_random_question_data(config, 1, avg_word_len=avg_word_len) assert len(question) == config.gen_sequence_length assert len(context) == config.gen_sequence_length num_q_spaces = question.count(" ") From 289f545b66c98c6b07951e1f2e3dd1d3b842281f Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Tue, 15 Aug 2023 12:42:20 -0400 Subject: [PATCH 36/37] style --- src/deepsparse/benchmark/data_creation.py | 5 ++++- src/deepsparse/benchmark/helpers.py | 1 + tests/test_pipeline_benchmark.py | 4 +++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/benchmark/data_creation.py b/src/deepsparse/benchmark/data_creation.py index 7c6c378b1b..0b6c921b9d 100644 --- a/src/deepsparse/benchmark/data_creation.py +++ b/src/deepsparse/benchmark/data_creation.py @@ -110,7 +110,10 @@ def generate_random_image_data( f"Using default image shape {image_shape}" ) - input_data = [numpy.random.randint(0, high=255, size=image_shape).astype(numpy.uint8) for _ in range(batch_size)] + input_data = [ + numpy.random.randint(0, high=255, size=image_shape).astype(numpy.uint8) + for _ in range(batch_size) + ] return input_data diff --git a/src/deepsparse/benchmark/helpers.py b/src/deepsparse/benchmark/helpers.py index baa36b47ba..703dafa92c 100644 --- a/src/deepsparse/benchmark/helpers.py +++ b/src/deepsparse/benchmark/helpers.py @@ -36,6 +36,7 @@ DEFAULT_STRING_LENGTH = 50 DEFAULT_IMAGE_SHAPE = (240, 240, 3) + class ThreadPinningMode: CORE: str = "core" NUMA: str = "numa" diff --git a/tests/test_pipeline_benchmark.py b/tests/test_pipeline_benchmark.py index ad685e9c77..485599d044 100644 --- a/tests/test_pipeline_benchmark.py +++ b/tests/test_pipeline_benchmark.py @@ -129,7 +129,9 @@ def test_generate_random_question_data(): avg_word_len = 10 config_args = {"gen_sequence_length": 50} config = PipelineBenchmarkConfig(**config_args) - question, context = generate_random_question_data(config, 1, avg_word_len=avg_word_len) + question, context = generate_random_question_data( + config, 1, avg_word_len=avg_word_len + ) assert len(question) == config.gen_sequence_length assert len(context) == config.gen_sequence_length num_q_spaces = question.count(" ") From 749a7521447435dd0b0af53e1009b419892f07f7 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Tue, 15 Aug 2023 12:53:03 -0400 Subject: [PATCH 37/37] PR comments --- src/deepsparse/benchmark/benchmark_pipeline.py | 4 +++- src/deepsparse/benchmark/data_creation.py | 16 ++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_pipeline.py b/src/deepsparse/benchmark/benchmark_pipeline.py index 88283a7dec..373e6257bb 100644 --- a/src/deepsparse/benchmark/benchmark_pipeline.py +++ b/src/deepsparse/benchmark/benchmark_pipeline.py @@ -79,6 +79,8 @@ zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/ bigpython_bigquery_thepile/pruned50-none \ -c config.json -t 30 -s async + +Refer to README for config.json examples """ import json @@ -125,7 +127,7 @@ class PipelineExecutorThread(threading.Thread): """ - Run pipeline reoeatedly on inputs for max_time seconds, storing the runtime of each + Run pipeline repeatedly on inputs for max_time seconds, storing the runtime of each section of the pipeline in its timer manager For intended usage, see 
multistream_benchmark diff --git a/src/deepsparse/benchmark/data_creation.py b/src/deepsparse/benchmark/data_creation.py index 0b6c921b9d..c44a2a455c 100644 --- a/src/deepsparse/benchmark/data_creation.py +++ b/src/deepsparse/benchmark/data_creation.py @@ -25,11 +25,6 @@ from deepsparse.benchmark.config import PipelineBenchmarkConfig -_LOGGER = logging.getLogger(__name__) - -DEFAULT_STRING_LENGTH = 50 -DEFAULT_IMAGE_SHAPE = (240, 240, 3) - __all__ = [ "get_input_schema_type", "generate_random_image_data", @@ -40,6 +35,11 @@ "load_question_data", ] +_LOGGER = logging.getLogger(__name__) + +DEFAULT_STRING_LENGTH = 50 +DEFAULT_IMAGE_SHAPE = (240, 240, 3) + class SchemaType: IMAGE: str = "images" @@ -72,7 +72,7 @@ def get_input_schema_type(pipeline: Pipeline) -> str: raise Exception("Unknown schema requirement {}".format(input_schema_requirements)) -def get_files_with_endings( +def get_files_with_suffixes( folder: str, num_files: int, recursive: bool, file_endings: Tuple[str] ) -> List[str]: if not path.exists(folder): @@ -122,7 +122,7 @@ def load_image_data(config: PipelineBenchmarkConfig, batch_size: int) -> List[st raise Exception("Data folder must be defined for real inputs") path_to_data = config.data_folder recursive_search = config.recursive_search - return get_files_with_endings( + return get_files_with_suffixes( path_to_data, batch_size, recursive_search, (".jpg", ".jpeg", ".gif") ) @@ -148,7 +148,7 @@ def load_text_data(config: PipelineBenchmarkConfig, batch_size: int) -> List[str raise Exception("Data folder must be defined for real inputs") path_to_data = config.data_folder recursive_search = config.recursive_search - input_files = get_files_with_endings( + input_files = get_files_with_suffixes( path_to_data, batch_size, recursive_search, (".txt") ) if config.max_string_length:
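
For readers who want to drive the benchmark from Python rather than through the `deepsparse.benchmark_pipeline` CLI, the sketch below shows one way to call the entry points introduced in this series. It is illustrative only, not part of the patches themselves, and assumes the series is applied as-is: the import paths, keyword arguments, and three-value return unpacking mirror what `main()` does in `benchmark_pipeline.py`, the SparseZoo stub is reused from the README example above, and the config values are arbitrary placeholders.

```python
from deepsparse.benchmark.benchmark_pipeline import (
    benchmark_pipeline,
    calculate_section_stats,
)
from deepsparse.benchmark.config import PipelineBenchmarkConfig

# Dummy text inputs of 128 generated characters, as in the config.json examples
config = PipelineBenchmarkConfig(data_type="dummy", gen_sequence_length=128)

# Keyword names mirror the call made in main(); the stub can be swapped for a
# local ONNX model path
batch_times, total_run_time, num_streams = benchmark_pipeline(
    model_path="zoo:nlp/sentiment_analysis/distilbert-none/pytorch/huggingface/sst2/pruned90-none",
    task="text_classification",
    config=config,
    batch_size=1,
    scenario="sync",
    seconds_to_run=10,
)

# Same per-section statistics the CLI prints; mean latencies are in ms/batch
section_stats = calculate_section_stats(batch_times, total_run_time, num_streams)
print(section_stats["total_inference"]["mean"])
```

The same `PipelineBenchmarkConfig` accepts the other fields defined in `config.py` (for example `data_folder`, `recursive_search`, and `max_string_length` for real data), so the dummy configuration here can be swapped for a real-data one without changing the call.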