Replace Dead SparseZoo Stubs in Documentation #1279

Merged · 4 commits · Sep 26, 2023
26 changes: 13 additions & 13 deletions examples/benchmark/resnet50_benchmark.py
````diff
@@ -123,52 +123,52 @@ def main():
     results = benchmark_model(
         (
             "zoo:cv/classification/resnet_v1-50/pytorch/sparseml/imagenet/"
-            "pruned-conservative"
+            "pruned80_quant-none-vnni"
         ),
         sample_inputs,
         batch_size=batch_size,
         num_cores=num_cores,
         num_iterations=num_iterations,
         num_warmup_iterations=num_warmup_iterations,
     )
-    print(f"ResNet-50 v1 Pruned Conservative FP32 {results}")
+    print(f"ResNet-50 v1 Pruned 80 INT8 {results}")
+
+    if not VNNI:
+        print(
+            "WARNING: VNNI instructions not detected, "
+            "quantization (INT8) speedup not well supported"
+        )
 
     results = benchmark_model(
         (
             "zoo:cv/classification/resnet_v1-50/pytorch/sparseml/imagenet/"
-            "pruned-moderate"
+            "pruned90-none"
         ),
         sample_inputs,
         batch_size=batch_size,
         num_cores=num_cores,
         num_iterations=num_iterations,
         num_warmup_iterations=num_warmup_iterations,
     )
-    print(f"ResNet-50 v1 Pruned Moderate FP32 {results}")
-
-    if not VNNI:
-        print(
-            "WARNING: VNNI instructions not detected, "
-            "quantization (INT8) speedup not well supported"
-        )
+    print(f"ResNet-50 v1 Pruned 90 FP32 {results}")
 
     results = benchmark_model(
         (
             "zoo:cv/classification/resnet_v1-50/pytorch/sparseml/imagenet/"
-            "pruned_quant-moderate"
+            "pruned90_quant-none"
         ),
         sample_inputs,
         batch_size=batch_size,
         num_cores=num_cores,
         num_iterations=num_iterations,
         num_warmup_iterations=num_warmup_iterations,
     )
-    print(f"ResNet-50 v1 Pruned Moderate INT8 {results}")
+    print(f"ResNet-50 v1 Pruned 90 INT8 {results}")
 
     results = benchmark_model(
         (
             "zoo:cv/classification/resnet_v1-50/pytorch/sparseml/imagenet/"
-            "pruned95_quant-none"
+            "pruned95_uniform_quant-none"
         ),
         sample_inputs,
         batch_size=batch_size,
````
12 changes: 6 additions & 6 deletions src/deepsparse/benchmark/README.md
````diff
@@ -20,10 +20,10 @@ limitations under the License.
 
 ### Quickstart
 
-After `pip install deepsparse`, the benchmark tool is available on your CLI. For example, to benchmark a dense BERT ONNX model fine-tuned on the SST2 dataset where the model path is the minimum input required to get started, run:
+After `pip install deepsparse`, the benchmark tool is available on your CLI. For example, to benchmark a dense BERT ONNX model fine-tuned on the MNLI dataset where the model path is the minimum input required to get started, run:
 
 ```
-deepsparse.benchmark zoo:nlp/text_classification/bert-base/pytorch/huggingface/sst2/base-none
+deepsparse.benchmark zoo:nlp/text_classification/bert-base/pytorch/huggingface/mnli/base-none
 ```
 __ __
 ### Usage
````
````diff
@@ -94,7 +94,7 @@ optional arguments:
 Example CLI command for benchmarking an ONNX model from the SparseZoo and saving the results to a `benchmark.json` file:
 
 ```
-deepsparse.benchmark zoo:nlp/text_classification/bert-base/pytorch/huggingface/sst2/base-none -x benchmark.json
+deepsparse.benchmark zoo:nlp/text_classification/bert-base/pytorch/huggingface/mnli/base-none -x benchmark.json
 ```
 Output of the JSON file:
 
````
````diff
@@ -108,10 +108,10 @@ To run a sparse FP32 MobileNetV1 at batch size 16 for 10 seconds for throughput
 deepsparse.benchmark zoo:cv/classification/mobilenet_v1-1.0/pytorch/sparseml/imagenet/pruned-moderate --batch_size 16 --time 10 --scenario async --num_streams 8
 ```
````
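In the `async` scenario above, the tool keeps several requests (`--num_streams 8`) in flight at once to maximize throughput. As a toy illustration of that idea only — a dummy `infer` function and a thread pool, not the DeepSparse engine:

```python
import time
from concurrent.futures import ThreadPoolExecutor

def infer(batch):
    """Stand-in for an engine call; sleeps to simulate compute."""
    time.sleep(0.01)
    return [x * 2 for x in batch]

def items_per_sec(num_streams, seconds=0.5, batch_size=16):
    # Submit `num_streams` requests at a time and count completed items
    # until the time budget runs out, mirroring the async scenario.
    done = 0
    deadline = time.perf_counter() + seconds
    with ThreadPoolExecutor(max_workers=num_streams) as pool:
        while time.perf_counter() < deadline:
            futures = [pool.submit(infer, [0] * batch_size) for _ in range(num_streams)]
            for fut in futures:
                fut.result()
            done += num_streams * batch_size
    return done / seconds

# More concurrent streams complete more of the sleeping "inferences" per second.
print(f"1 stream: {items_per_sec(1):.0f} items/sec")
print(f"8 streams: {items_per_sec(8):.0f} items/sec")
```

Because the stand-in work only sleeps, throughput here scales almost linearly with the number of streams; a real engine scales only while idle cores remain to absorb the extra requests.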

````diff
-To run a sparse quantized INT8 6-layer BERT at batch size 1 for latency:
+To run a sparse quantized INT8 BERT at batch size 1 for latency:
 
 ```
-deepsparse.benchmark zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/pruned_quant_6layers-aggressive_96 --batch_size 1 --scenario sync
+deepsparse.benchmark zoo:nlp/question_answering/bert-large/pytorch/huggingface/squad/pruned90_quant-none --batch_size 1 --scenario sync
 ```
````
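The `sync` scenario instead runs one request at a time, which is what matters for per-request latency. A minimal sketch of that measurement loop, again with a dummy `infer` standing in for the engine:

```python
import statistics
import time

def infer(batch):
    """Stand-in for a batch-1 engine call; sleeps ~5 ms."""
    time.sleep(0.005)
    return batch

def latency_ms(iterations=50):
    # One request in flight at a time; record wall time per call,
    # mirroring the sync scenario's latency report.
    samples = []
    for _ in range(iterations):
        start = time.perf_counter()
        infer([0])
        samples.append((time.perf_counter() - start) * 1000.0)
    return statistics.median(samples), max(samples)

median, worst = latency_ms()
print(f"median: {median:.2f} ms, worst: {worst:.2f} ms")
```

Reporting a median (or percentiles) alongside the worst case matters because a single slow call can dominate a plain average.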
__ __
### ⚡ Inference Scenarios
````diff
@@ -341,4 +341,4 @@ Mean Latency Breakdown (ms/batch):
 engine_prompt_prefill_single: 19.0412
 engine_token_generation: 19603.0353
 engine_token_generation_single: 19.1170
-```
\ No newline at end of file
+```
````
6 changes: 3 additions & 3 deletions src/deepsparse/transformers/README.md
````diff
@@ -1,4 +1,4 @@
-# Hugging Face Transformer Inference Pipelines
+x# Hugging Face Transformer Inference Pipelines
 
 DeepSparse allows accelerated inference, serving, and benchmarking of sparsified [Hugging Face Transformer](https://github.com/huggingface/transformers) models.
````

> **Review comment (Contributor):** Looks like a stray "x" got in here and breaks the headline formatting.
````diff
@@ -208,7 +208,7 @@ Spinning up:
 ```bash
 deepsparse.server \
     task sentiment-analysis \
-    --model_path "zoo:nlp/sentiment_analysis/bert-base/pytorch/huggingface/sst2/12layer_pruned80_quant-none-vnni"
+    --model_path "zoo:nlp/sentiment_analysis/bert-base/pytorch/huggingface/sst2/pruned80_quant-none-vnni"
 ```
 
 Making a request:
````
````diff
@@ -314,7 +314,7 @@ Spinning up:
 ```bash
 deepsparse.server \
     task token-classification \
-    --model_path "zoo:nlp/token_classification/bert-base/pytorch/huggingface/conll2003/12layer_pruned80_quant-none-vnni"
+    --model_path "zoo:nlp/token_classification/bert-base/pytorch/huggingface/conll2003/pruned90-none"
 ```
 
 Making a request:
````