From 921f42392a8ec4b9c8756e8be7d8527b342a1bab Mon Sep 17 00:00:00 2001 From: Naman Nandan Date: Tue, 22 Aug 2023 14:23:57 -0700 Subject: [PATCH 01/11] Issue warning about allowed_urls when default value is used (#2534) Co-authored-by: Naman Nandan --- .../java/org/pytorch/serve/util/ConfigManager.java | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/frontend/server/src/main/java/org/pytorch/serve/util/ConfigManager.java b/frontend/server/src/main/java/org/pytorch/serve/util/ConfigManager.java index 9a7c0d657e..294abd9fbe 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/util/ConfigManager.java +++ b/frontend/server/src/main/java/org/pytorch/serve/util/ConfigManager.java @@ -45,6 +45,7 @@ import org.apache.commons.io.IOUtils; import org.pytorch.serve.servingsdk.snapshot.SnapshotSerializer; import org.pytorch.serve.snapshot.SnapshotSerializerFactory; +import org.slf4j.Logger; import org.slf4j.LoggerFactory; public final class ConfigManager { @@ -111,6 +112,9 @@ public final class ConfigManager { private static final String MODEL_CONFIG = "models"; private static final String VERSION = "version"; + // Configuration default values + private static final String DEFAULT_TS_ALLOWED_URLS = "file://.*|http(s)?://.*"; + // Variables which are local public static final String MODEL_METRICS_LOGGER = "MODEL_METRICS"; public static final String MODEL_LOGGER = "MODEL_LOG"; @@ -136,6 +140,7 @@ public final class ConfigManager { private String hostName; private Map> modelConfig = new HashMap<>(); private String torchrunLogDir; + private Logger logger = LoggerFactory.getLogger(ConfigManager.class); private ConfigManager(Arguments args) throws IOException { prop = new Properties(); @@ -234,6 +239,13 @@ private ConfigManager(Arguments args) throws IOException { } setModelConfig(); + + // Issue warnining about URLs that can be accessed when loading models + if (prop.getProperty(TS_ALLOWED_URLS, DEFAULT_TS_ALLOWED_URLS) == DEFAULT_TS_ALLOWED_URLS) { + logger.warn( + "Your torchserve instance can access any URL to load models. 
" + + "When deploying to production, make sure to limit the set of allowed_urls in config.properties"); + } } public static String readFile(String path) throws IOException { @@ -783,7 +795,7 @@ private static int getAvailableGpu() { } public List getAllowedUrls() { - String allowedURL = prop.getProperty(TS_ALLOWED_URLS, "file://.*|http(s)?://.*"); + String allowedURL = prop.getProperty(TS_ALLOWED_URLS, DEFAULT_TS_ALLOWED_URLS); return Arrays.asList(allowedURL.split(",")); } From b37296d4be27679cba234873c65e63c31df2cfea Mon Sep 17 00:00:00 2001 From: Jagadeesh J Date: Thu, 24 Aug 2023 22:22:53 +0530 Subject: [PATCH 02/11] fix metrics for k8s setup (#2473) * fix metrics for k8s setup Signed-off-by: jagadeesh * fix lint error --------- Signed-off-by: jagadeesh Co-authored-by: Geeta Chauhan <4461127+chauhang@users.noreply.github.com> --- .pre-commit-config.yaml | 1 + kubernetes/EKS/config.properties | 2 ++ kubernetes/GKE/config.properties | 2 ++ kubernetes/Helm/templates/torchserve.yaml | 5 ++-- kubernetes/README.md | 29 ++++++++++++++--------- 5 files changed, 26 insertions(+), 13 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ec9f575678..7af6034e2b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,6 +12,7 @@ repos: - id: check-json - id: check-toml - id: check-yaml + args: [--allow-multiple-documents, --unsafe] - id: end-of-file-fixer - id: mixed-line-ending - id: trailing-whitespace diff --git a/kubernetes/EKS/config.properties b/kubernetes/EKS/config.properties index 67f9d10cce..e6003a92c8 100644 --- a/kubernetes/EKS/config.properties +++ b/kubernetes/EKS/config.properties @@ -1,6 +1,8 @@ inference_address=http://0.0.0.0:8080 management_address=http://0.0.0.0:8081 metrics_address=http://0.0.0.0:8082 +enable_metrics_api=true +metrics_mode=prometheus NUM_WORKERS=1 number_of_gpu=1 number_of_netty_threads=32 diff --git a/kubernetes/GKE/config.properties b/kubernetes/GKE/config.properties index 67f9d10cce..e6003a92c8 100644 --- a/kubernetes/GKE/config.properties +++ b/kubernetes/GKE/config.properties @@ -1,6 +1,8 @@ inference_address=http://0.0.0.0:8080 management_address=http://0.0.0.0:8081 metrics_address=http://0.0.0.0:8082 +enable_metrics_api=true +metrics_mode=prometheus NUM_WORKERS=1 number_of_gpu=1 number_of_netty_threads=32 diff --git a/kubernetes/Helm/templates/torchserve.yaml b/kubernetes/Helm/templates/torchserve.yaml index e847246100..71cecfb56b 100644 --- a/kubernetes/Helm/templates/torchserve.yaml +++ b/kubernetes/Helm/templates/torchserve.yaml @@ -6,13 +6,14 @@ metadata: labels: app: torchserve annotations: - prometheus.io/scrape: 'true' + prometheus.io/scrape: "true" + prometheus.io/path: /metrics prometheus.io/port: '8082' spec: ports: - name: preds port: {{ .Values.torchserve.inference_port }} - targetPort: ts + targetPort: ts - name: mdl port: {{ .Values.torchserve.management_port }} targetPort: ts-management diff --git a/kubernetes/README.md b/kubernetes/README.md index f94f15f937..9575499cea 100644 --- a/kubernetes/README.md +++ b/kubernetes/README.md @@ -1,5 +1,5 @@ # Torchserve on Kubernetes - + ## Overview This page demonstrates a Torchserve deployment in Kubernetes using Helm Charts. It uses the DockerHub Torchserve Image for the pods and a PersistentVolume for storing config / model files. @@ -66,7 +66,7 @@ persitant_volume: size: 1Gi ``` -To install Torchserve run ```helm install ts .``` +To install Torchserve run ```helm install ts .``` ```bash ubuntu@ip-172-31-50-36:~/serve/kubernetes/Helm$ helm install ts . 
@@ -207,12 +207,6 @@ helm repo add grafana https://grafana.github.io/helm-charts helm install grafana grafana/grafana ``` -Get admin user password by running: - -```bash -kubectl get secret --namespace default grafana -o jsonpath="{.data.admin-password}" | base64 --decode ; echo -``` - ## Add prometheus as data source in grafana ```bash @@ -233,8 +227,6 @@ kubectl get pod prometheus-server-f8677599b-xmjbt -o jsonpath='{.status.podIPs[0 192.168.52.141 ``` -![Add data source](images/grafana_datasource.png) - ## Expose grafana with loadbalancer ```bash @@ -243,9 +235,24 @@ kubectl patch service grafana -p '{"spec": {"type": "LoadBalancer"}}' kubectl get svc grafana -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' ``` +Get admin user password by running: + +```bash +kubectl get secret --namespace default grafana -o jsonpath="{.data.admin-password}" | base64 --decode ; echo +``` + ## Login to grafana -`` +Username: admin +Password: <--The password got from previous step--> + +Open Grafana in browser with the url - `` + +Add Prometheus data source + +![Add data source](images/grafana_datasource.png) + +The TS metrics will be available in Prometheus for Grafana dashboards. ## Logging From cd7c47efd8b7d1f3b9650a3457f72995590602c8 Mon Sep 17 00:00:00 2001 From: Jagadeesh J Date: Thu, 24 Aug 2023 23:06:18 +0530 Subject: [PATCH 03/11] fix kserve storage optional package (#2537) * fix kserve storage optional package Signed-off-by: jagadeesh * upgrade kserve at Docker dev Signed-off-by: jagadeesh --------- Signed-off-by: jagadeesh Co-authored-by: Ankith Gunapal --- kubernetes/kserve/Dockerfile.dev | 5 ++--- kubernetes/kserve/kserve_wrapper/TorchserveModel.py | 3 ++- kubernetes/kserve/requirements.txt | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kubernetes/kserve/Dockerfile.dev b/kubernetes/kserve/Dockerfile.dev index 4e7970c11e..731a0e8e2b 100644 --- a/kubernetes/kserve/Dockerfile.dev +++ b/kubernetes/kserve/Dockerfile.dev @@ -17,7 +17,6 @@ FROM ${BASE_IMAGE} AS compile-image ARG BASE_IMAGE ARG BRANCH_NAME=master ARG MACHINE_TYPE=cpu -ARG BRANCH_NAME_KF=master ENV PYTHONUNBUFFERED TRUE RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ @@ -46,7 +45,7 @@ RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ && rm -rf /var/lib/apt/lists/* \ && cd /tmp \ -RUN update-alternatives --remove python /usr/bin/python \ + RUN update-alternatives --remove python /usr/bin/python \ && update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 #ADD "https://www.random.org/cgi-bin/randbyte?nbytes=10&format=h" skipcache @@ -62,7 +61,7 @@ RUN if [ "$MACHINE_TYPE" = "gpu" ]; then export USE_CUDA=1; fi \ && git checkout ${BRANCH_NAME} \ && if [ -z "$CUDA_VERSION" ]; then python ts_scripts/install_dependencies.py --environment=dev; else python ts_scripts/install_dependencies.py --environment=dev --cuda $CUDA_VERSION; fi \ && python ts_scripts/install_from_src.py \ - && python -m pip install captum transformers kserve \ + && python -m pip install captum transformers kserve[storage]>=0.11.0 \ && python -m pip install . 
\ && useradd -m model-server \ && mkdir -p /home/model-server/tmp \ diff --git a/kubernetes/kserve/kserve_wrapper/TorchserveModel.py b/kubernetes/kserve/kserve_wrapper/TorchserveModel.py index aa28a50aa7..b85aa52e01 100644 --- a/kubernetes/kserve/kserve_wrapper/TorchserveModel.py +++ b/kubernetes/kserve/kserve_wrapper/TorchserveModel.py @@ -6,6 +6,7 @@ import kserve from kserve.errors import ModelMissingError from kserve.model import Model as Model +from kserve.storage import Storage logging.basicConfig(level=kserve.constants.KSERVE_LOGLEVEL) @@ -53,7 +54,7 @@ def load(self) -> bool: """This method validates model availabilty in the model directory and sets ready flag to true. """ - model_path = pathlib.Path(kserve.Storage.download(self.model_dir)) + model_path = pathlib.Path(Storage.download(self.model_dir)) paths = list(pathlib.Path(model_path).glob("*.mar")) existing_paths = [path for path in paths if path.exists()] if len(existing_paths) == 0: diff --git a/kubernetes/kserve/requirements.txt b/kubernetes/kserve/requirements.txt index 81199e737f..d38cdf548e 100644 --- a/kubernetes/kserve/requirements.txt +++ b/kubernetes/kserve/requirements.txt @@ -1,3 +1,3 @@ -kserve>=0.9.0 +kserve[storage]>=0.11.0 transformers captum From 2ff502090c24b51fb22c0038ae049847940e8369 Mon Sep 17 00:00:00 2001 From: Naman Nandan Date: Thu, 24 Aug 2023 11:25:48 -0700 Subject: [PATCH 04/11] Updated example for custom metrics and add backwards compatibility warnings and upgrade guide for metrics APIs (#2516) * Add backward compatibility issues to doc * Update example for custom metrics * fix lint error * Update custom metrics example to work with backwards compatible API * Update custom metrics API documentation * Fix linter error * fix documentation --------- --- docs/metrics.md | 153 +++++++++--- examples/custom_metrics/README.md | 217 ++++++++++++------ examples/custom_metrics/config.properties | 12 + examples/custom_metrics/metrics.yaml | 103 +++++++++ examples/custom_metrics/mnist_handler.py | 107 ++++++++- .../custom_metrics/torchserve_custom.mtail | 24 -- ts/metrics/metric_cache_yaml_impl.py | 2 +- ts_scripts/spellcheck_conf/wordlist.txt | 7 + 8 files changed, 490 insertions(+), 135 deletions(-) create mode 100644 examples/custom_metrics/config.properties create mode 100644 examples/custom_metrics/metrics.yaml delete mode 100644 examples/custom_metrics/torchserve_custom.mtail diff --git a/docs/metrics.md b/docs/metrics.md index 9993948683..48b2065feb 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -10,6 +10,7 @@ * [Custom Metrics API](#custom-metrics-api) * [Logging custom metrics](#log-custom-metrics) * [Metrics YAML Parsing and Metrics API example](#Metrics-YAML-File-Parsing-and-Metrics-API-Custom-Handler-Example) +* [Backwards compatibility warnings and upgrade guide](#backwards-compatibility-warnings-and-upgrade-guide) ## Introduction @@ -28,7 +29,7 @@ Metrics are collected by default at the following locations in `log` mode: The location of log files and metric files can be configured in the [log4j2.xml](https://github.com/pytorch/serve/blob/master/frontend/server/src/main/resources/log4j2.xml) file -In `prometheus` mode, all metrics are made available in prometheus format via the [metrics](https://github.com/pytorch/serve/blob/master/docs/metrics_api.md) API endpoint. +In `prometheus` mode, all metrics are made available in prometheus format via the [metrics API endpoint](https://github.com/pytorch/serve/blob/master/docs/metrics_api.md). 
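In `prometheus` mode the endpoint can be scraped or queried directly; as a quick sanity check, a minimal Python snippet along these lines (assuming the default metrics port 8082 on localhost) prints the Prometheus-format output:

```python
import urllib.request

# Minimal sketch: fetch the Prometheus-format metrics
# (8082 is the default metrics_address port; adjust if reconfigured).
with urllib.request.urlopen("http://127.0.0.1:8082/metrics") as response:
    print(response.read().decode("utf-8"))
```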
## Frontend Metrics @@ -187,12 +188,17 @@ model_metrics: # backend metrics ``` -Default metrics are provided in the [metrics.yaml](https://github.com/pytorch/serve/blob/master/ts/configs/metrics.yaml) file, but the user can either delete them to their liking / ignore them altogether, because these metrics will not be emitted unless they are edited. +Note that **only** the metrics defined in the **metrics configuration file** can be emitted to logs or made available via the metrics API endpoint. This is done to ensure that the metrics configuration file serves as a central inventory of all the metrics that Torchserve can emit. + +Default metrics are provided in the [metrics.yaml](https://github.com/pytorch/serve/blob/master/ts/configs/metrics.yaml) file, but the user can either delete them to their liking / ignore them altogether, because these metrics will not be emitted unless they are edited.\ +When adding custom `model_metrics` in the metrics configuration file, ensure to include `ModelName` and `Level` dimension names towards the end of the list of dimensions since they are included by default by the following custom metrics APIs: +[add_metric](#function-api-to-add-generic-metrics-with-default-dimensions), [add_counter](#add-counter-based-metrics), +[add_time](#add-time-based-metrics), [add_size](#add-size-based-metrics) or [add_percent](#add-percentage-based-metrics). ### How it works -Whenever torchserve starts, the [backend worker](https://github.com/pytorch/serve/blob/master/ts/model_service_worker.py) initializes `service.context.metrics` with the [MetricsCache](https://github.com/pytorch/serve/blob/master/ts/metrics/metric_cache_yaml_impl.py) object. The `model_metrics` (backend metrics) section within the specified yaml file will be parsed, and Metric objects will be created based on the parsed section and added that are added to the cache. +Whenever torchserve starts, the [backend worker](https://github.com/pytorch/serve/blob/master/ts/model_service_worker.py) initializes `service.context.metrics` with the [MetricsCache](https://github.com/pytorch/serve/blob/master/ts/metrics/metric_cache_yaml_impl.py) object. The `model_metrics` (backend metrics) section within the specified yaml file will be parsed, and Metric objects will be created based on the parsed section and added to the cache. This is all done internally, so the user does not have to do anything other than specifying the desired yaml file. @@ -243,7 +249,7 @@ When adding any metric via Metrics API, users have the ability to override the m `metric_type=MetricTypes.[COUNTER/GAUGE/HISTOGRAM]`. ```python -metric1 = metrics.add_metric("GenericMetric", unit=unit, dimension_names=["name1", "name2", ...], metric_type=MetricTypes.GAUGE) +metric1 = metrics.add_metric_to_cache("GenericMetric", unit=unit, dimension_names=["name1", "name2", ...], metric_type=MetricTypes.GAUGE) metric.add_or_update(value, dimension_values=["value1", "value2", ...]) # Backwards compatible, combines the above two method calls @@ -311,31 +317,35 @@ dimN= Dimension(name_n, value_n) One can add metrics with generic units using the following function. 
-Function API +#### Function API to add generic metrics without default dimensions ```python - def add_metric(self, metric_name: str, unit: str, idx=None, dimension_names: list = None, - metric_type: MetricTypes = MetricTypes.COUNTER) -> None: + def add_metric_to_cache( + self, + metric_name: str, + unit: str, + dimension_names: list = [], + metric_type: MetricTypes = MetricTypes.COUNTER, + ) -> CachingMetric: """ - Create a new metric and add into cache. - Add a metric which is generic with custom metrics + Create a new metric and add into cache. Override existing metric if already present. Parameters ---------- - metric_name: str + metric_name str Name of metric - value: int, float - value of metric - unit: str - unit of metric - idx: int - request_id index in batch - dimensions: list - list of dimensions for the metric - metric_type: MetricTypes - Type of metric + unit str + unit can be one of ms, percent, count, MB, GB or a generic string + dimension_names list + list of dimension name strings for the metric + metric_type MetricTypes + Type of metric Counter, Gauge, Histogram + Returns + ------- + newly created Metrics object """ + def add_or_update( self, value: int or float, @@ -360,10 +370,52 @@ Function API # Add Distance as a metric # dimensions = [dim1, dim2, dim3, ..., dimN] # Assuming batch size is 1 for example -metric = metrics.add_metric('DistanceInKM', unit='km', dimension_names=[...]) +metric = metrics.add_metric_to_cache('DistanceInKM', unit='km', dimension_names=[...]) metric.add_or_update(distance, dimension_values=[...]) ``` +Note that calling `add_metric_to_cache` will not emit the metric, `add_or_update` will need to be called on the metric object as shown above. + +#### Function API to add generic metrics with default dimensions + +```python + def add_metric( + self, + name: str, + value: int or float, + unit: str, + idx: str = None, + dimensions: list = [], + metric_type: MetricTypes = MetricTypes.COUNTER, + ): + """ + Add a generic metric + Default metric type is counter + + Parameters + ---------- + name : str + metric name + value: int or float + value of the metric + unit: str + unit of metric + idx: str + request id to be associated with the metric + dimensions: list + list of Dimension objects for the metric + metric_type MetricTypes + Type of metric Counter, Gauge, Histogram + """ +``` + +```python +# Add Distance as a metric +# dimensions = [dim1, dim2, dim3, ..., dimN] +metric = metrics.add_metric('DistanceInKM', value=10, unit='km', dimensions=[...]) +``` + + ### Add time-based metrics **Time-based metrics are defaulted to a `GAUGE` metric type** @@ -373,7 +425,7 @@ Add time-based by invoking the following method: Function API ```python - def add_time(self, metric_name: str, value: int or float, idx=None, unit: str = 'ms', dimensions: list = None, + def add_time(self, name: str, value: int or float, idx=None, unit: str = 'ms', dimensions: list = None, metric_type: MetricTypes = MetricTypes.GAUGE): """ Add a time based metric like latency, default unit is 'ms' @@ -381,7 +433,7 @@ Function API Parameters ---------- - metric_name : str + name : str metric name value: int value of metric @@ -418,7 +470,7 @@ Add size-based metrics by invoking the following method: Function API ```python - def add_size(self, metric_name: str, value: int or float, idx=None, unit: str = 'MB', dimensions: list = None, + def add_size(self, name: str, value: int or float, idx=None, unit: str = 'MB', dimensions: list = None, metric_type: MetricTypes = MetricTypes.GAUGE): """ Add a 
size based metric @@ -426,7 +478,7 @@ Function API Parameters ---------- - metric_name : str + name : str metric name value: int, float value of metric @@ -463,7 +515,7 @@ Percentage based metrics can be added by invoking the following method: Function API ```python - def add_percent(self, metric_name: str, value: int or float, idx=None, dimensions: list = None, + def add_percent(self, name: str, value: int or float, idx=None, dimensions: list = None, metric_type: MetricTypes = MetricTypes.GAUGE): """ Add a percentage based metric @@ -471,7 +523,7 @@ Function API Parameters ---------- - metric_name : str + name : str metric name value: int, float value of metric @@ -485,6 +537,8 @@ Function API ``` +**Inferred unit**: `percent` + To add custom percentage-based metrics: ```python @@ -503,14 +557,13 @@ Counter based metrics can be added by invoking the following method Function API ```python - def add_counter(self, metric_name: str, value: int or float, idx=None, dimensions: list = None, - metric_type: MetricTypes = MetricTypes.COUNTER): + def add_counter(self, name: str, value: int or float, idx=None, dimensions: list = None): """ Add a counter metric or increment an existing counter metric Default metric type is counter Parameters ---------- - metric_name : str + name : str metric name value: int or float value of metric @@ -518,11 +571,11 @@ Function API request_id index in batch dimensions: list list of dimensions for the metric - metric_type: MetricTypes - type for defining different operations, defaulted to counter metric type for Counter metrics """ ``` +**Inferred unit**: `count` + ### Getting a metric Users can get a metric from the cache. The Metric object is returned, so the user can access the methods of the Metric: (i.e. `Metric.update(value)`, `Metric.__str__`) @@ -622,3 +675,39 @@ class CustomHandlerExample: # except this time with gauge metric type object metrics.add_size("GaugeModelMetricNameExample", 42.5) ``` + +## Backwards compatibility warnings and upgrade guide +1. 
Starting [v0.6.1](https://github.com/pytorch/serve/releases/tag/v0.6.1), the `add_metric` API signature changed\ + from: [add_metric(name, value, unit, idx=None, dimensions=None)](https://github.com/pytorch/serve/blob/61f1c4182e6e864c9ef1af99439854af3409d325/ts/metrics/metrics_store.py#L184)\ + to: [add_metric(metric_name, unit, dimension_names=None, metric_type=MetricTypes.COUNTER)](https://github.com/pytorch/serve/blob/35ef00f9e62bb7fcec9cec92630ae757f9fb0db0/ts/metrics/metric_cache_abstract.py#L272).\ + In versions greater than v0.8.1 the `add_metric` API signature was updated to support backwards compatibility:\ + from: [add_metric(metric_name, unit, dimension_names=None, metric_type=MetricTypes.COUNTER)](https://github.com/pytorch/serve/blob/35ef00f9e62bb7fcec9cec92630ae757f9fb0db0/ts/metrics/metric_cache_abstract.py#L272)\ + to: `add_metric(name, value, unit, idx=None, dimensions=[], metric_type=MetricTypes.COUNTER)`\ + Usage of the new API is shown [above](#specifying-metric-types).\ + **Upgrade paths**: + - **[< v0.6.1] to [v0.6.1 - v0.8.1]**\ + There are two approaches available when migrating to the new custom metrics API: + - Replace the call to `add_metric` with calls to the following methods: + ```python + metric1 = metrics.add_metric("GenericMetric", unit=unit, dimension_names=["name1", "name2", ...], metric_type=MetricTypes.GAUGE) + metric1.add_or_update(value, dimension_values=["value1", "value2", ...]) + ``` + - Replace the call to `add_metric` in versions prior to v0.6.1 with one of the suitable custom metrics APIs where applicable: [add_counter](#add-counter-based-metrics), [add_time](#add-time-based-metrics), + [add_size](#add-size-based-metrics) or [add_percent](#add-percentage-based-metrics) + - **[< v0.6.1] to [> v0.8.1]**\ + The call to `add_metric` is backwards compatible but the metric type is inferred to be `COUNTER`. If the metric is of a different type, an additional argument `metric_type` will need to be provided to the `add_metric` + call shown below + ```python + metrics.add_metric(name='GenericMetric', value=10, unit='count', dimensions=[...], metric_type=MetricTypes.GAUGE) + ``` + - **[v0.6.1 - v0.8.1] to [> v0.8.1]**\ + Replace the call to `add_metric` with `add_metric_to_cache`. +2. Starting [v0.8.0](https://github.com/pytorch/serve/releases/tag/v0.8.0), only metrics that are defined in the metrics config file(default: [metrics.yaml](https://github.com/pytorch/serve/blob/master/ts/configs/metrics.yaml)) + are either all logged to `ts_metrics.log` and `model_metrics.log` or made available via the [metrics API endpoint](https://github.com/pytorch/serve/blob/master/docs/metrics_api.md) + based on the `metrics_mode` configuration as described [above](#introduction).\ + The default `metrics_mode` is `log` mode.\ + This is unlike in previous versions where all metrics were only logged to `ts_metrics.log` and `model_metrics.log` except for `ts_inference_requests_total`, `ts_inference_latency_microseconds` and `ts_queue_latency_microseconds` + which were only available via the metrics API endpoint.\ + **Upgrade paths**: + - **[< v0.8.0] to [>= v0.8.0]**\ + Specify all the custom metrics added to the custom handler in the metrics configuration file as shown [above](#central-metrics-yaml-file-definition). 
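For instance, a handler written against the pre-v0.6.1 API can be migrated to the backwards-compatible call as in the minimal sketch below (the metric name, value and extra `Route` dimension are illustrative; `ModelName` and `Level` dimensions are appended by default, and the metric must also be declared in the metrics configuration file):

```python
from ts.metrics.dimension import Dimension
from ts.metrics.metric_type_enum import MetricTypes


def emit_distance_metric(context, distance_km):
    # Illustrative sketch: "DistanceInKM" must also be listed under
    # model_metrics in the metrics configuration file.
    metrics = context.metrics

    # Pre-v0.6.1 style call (no metric type argument):
    #   metrics.add_metric("DistanceInKM", distance_km, "km", dimensions=[...])
    # Backwards-compatible call in versions greater than v0.8.1; without
    # metric_type the metric is inferred to be a COUNTER, so GAUGE is stated.
    metrics.add_metric(
        name="DistanceInKM",
        value=distance_km,
        unit="km",
        dimensions=[Dimension(name="Route", value="example")],
        metric_type=MetricTypes.GAUGE,
    )
```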
diff --git a/examples/custom_metrics/README.md b/examples/custom_metrics/README.md index 149a71cf8d..53199ab3b3 100644 --- a/examples/custom_metrics/README.md +++ b/examples/custom_metrics/README.md @@ -1,93 +1,172 @@ -# Monitoring Torchserve custom metrics with mtail metrics exporter and prometheus +# Torchserve custom metrics with prometheus support -In this example, we show how to use a pre-trained custom MNIST model and export the custom metrics using mtail and prometheus +In this example, we show how to use a pre-trained custom MNIST model and export custom metrics using prometheus. -We used the following pytorch example to train the basic MNIST model for digit recognition : https://github.com/pytorch/examples/tree/master/mnist +We use the following pytorch example of MNIST model for digit recognition : https://github.com/pytorch/examples/tree/master/mnist -Run the commands given in following steps from the parent directory of the root of the repository. For example, if you cloned the repository into /home/my_path/serve, run the steps from /home/my_path +Run the commands given in following steps from the root directory of the repository. For example, if you cloned the repository into /home/my_path/serve, run the steps from /home/my_path/serve ## Steps -- Step 1: In this example we introduce a new custom metric `SizeOfImage` in the custom handler and export it using mtail. - - ```python - def preprocess(self, data): - metrics = self.context.metrics - input = data[0].get('body') - metrics.add_size('SizeOfImage', len(input) / 1024, None, 'kB') - return ImageClassifier.preprocess(self, data) - ``` - - Refer: [Custom Metrics](https://github.com/pytorch/serve/blob/master/docs/metrics.md#custom-metrics-api) +- Step 1: In this example we add the following custom metrics and access them in prometheus format via the [metrics API endpoint](https://github.com/pytorch/serve/blob/master/docs/metrics_api.md): + - InferenceRequestCount + - InitializeCallCount + - PreprocessCallCount + - PostprocessCallCount + - RequestBatchSize + - SizeOfImage + - HandlerMethodTime + - ExamplePercentMetric + + The custom metrics configuration file `metrics.yaml` in this example builds on top of the [default metrics configuration file](https://github.com/pytorch/serve/blob/master/ts/configs/metrics.yaml) to include the custom metrics listed above. + The `config.properties` file in this example configures torchserve to use the custom metrics configuration file and sets the `metrics_mode` to `prometheus`. The custom handler + `mnist_handler.py` updates the metrics listed above. + + Refer: [Custom Metrics](https://github.com/pytorch/serve/blob/master/docs/metrics.md#custom-metrics-api)\ Refer: [Custom Handler](https://github.com/pytorch/serve/blob/master/docs/custom_service.md#custom-handlers) -- Step 2: Create a torch model archive using the torch-model-archiver utility to archive the above files. +- Step 2: Create a torch model archive using the torch-model-archiver utility. ```bash torch-model-archiver --model-name mnist --version 1.0 --model-file examples/image_classifier/mnist/mnist.py --serialized-file examples/image_classifier/mnist/mnist_cnn.pt --handler examples/custom_metrics/mnist_handler.py ``` -- Step 3: Register the model on TorchServe using the above model archive file. +- Step 3: Register the model to torchserve using the above model archive file. 
```bash mkdir model_store mv mnist.mar model_store/ - torchserve --start --model-store model_store --models mnist=mnist.mar - ``` - -- Step 4: Install [mtail](https://github.com/google/mtail/releases) - - ```bash - wget https://github.com/google/mtail/releases/download/v3.0.0-rc47/mtail_3.0.0-rc47_Linux_x86_64.tar.gz - tar -xvzf mtail_3.0.0-rc47_Linux_x86_64.tar.gz - chmod +x mtail + torchserve --ncs --start --model-store model_store --ts-config examples/custom_metrics/config.properties --models mnist=mnist.mar ``` -- Step 5: Create a mtail program. In this example we using a program to export default custom metrics. - - Refer: [mtail Programming Guide](https://google.github.io/mtail/Programming-Guide.html). - -- Step 6: Start mtail export by running the below command - - ```bash - ./mtail --progs examples/custom_metrics/torchserve_custom.mtail --logs logs/model_metrics.log - ``` - - The mtail program parses the log file extracts info by matching patterns and presents as JSON, Prometheus and other databases. https://google.github.io/mtail/Interoperability.html - -- Step 7: Make Inference request +- Step 4: Make Inference request ```bash curl http://127.0.0.1:8080/predictions/mnist -T examples/image_classifier/mnist/test_data/0.png ``` - The inference request logs the time taken for prediction to the model_metrics.log file. - Mtail parses the file and is served at 3903 port - - `http://localhost:3903` - -- Step 8: Sart Prometheus with mtailtarget added to scarpe config - - - Download [Prometheus](https://prometheus.io/download/) - - - Add mtail target added to scrape config in the config file - - ```yaml - scrape_configs: - # The job name is added as a label `job=` to any timeseries scraped from this config. - - job_name: "prometheus" - - # metrics_path defaults to '/metrics' - # scheme defaults to 'http'. - - static_configs: - - targets: ["localhost:9090", "localhost:3903"] - ``` - - - Start Prometheus with config file - - ```bash - ./prometheus --config.file prometheus.yml - ``` - - The exported logs from mtail are scraped by prometheus on 3903 port. +- Step 5: Install prometheus using the instructions [here](https://prometheus.io/download/#prometheus). + +- Step 6: Create a minimal `prometheus.yaml` config file as below and run `./prometheus --config.file=prometheus.yaml`. 
+ +```yaml +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + - job_name: 'torchserve' + static_configs: + - targets: ['localhost:8082'] #TorchServe metrics endpoint +``` + +- Step 7: Test metrics API endpoint +```console +curl http://127.0.0.1:8082/metrics + +# HELP Requests2XX Torchserve prometheus counter metric with unit: Count +# TYPE Requests2XX counter +Requests2XX{Level="Host",Hostname="88665a372f4b.ant.amazon.com",} 1.0 +# HELP PredictionTime Torchserve prometheus gauge metric with unit: ms +# TYPE PredictionTime gauge +PredictionTime{ModelName="mnist",Level="Model",Hostname="88665a372f4b.ant.amazon.com",} 62.78 +# HELP DiskUsage Torchserve prometheus gauge metric with unit: Gigabytes +# TYPE DiskUsage gauge +DiskUsage{Level="Host",Hostname="88665a372f4b.ant.amazon.com",} 8.438858032226562 +# HELP WorkerLoadTime Torchserve prometheus gauge metric with unit: Milliseconds +# TYPE WorkerLoadTime gauge +WorkerLoadTime{WorkerName="W-9000-mnist_1.0",Level="Host",Hostname="88665a372f4b.ant.amazon.com",} 7425.0 +# HELP Requests5XX Torchserve prometheus counter metric with unit: Count +# TYPE Requests5XX counter +# HELP CPUUtilization Torchserve prometheus gauge metric with unit: Percent +# TYPE CPUUtilization gauge +CPUUtilization{Level="Host",Hostname="88665a372f4b.ant.amazon.com",} 100.0 +# HELP WorkerThreadTime Torchserve prometheus gauge metric with unit: Milliseconds +# TYPE WorkerThreadTime gauge +WorkerThreadTime{Level="Host",Hostname="88665a372f4b.ant.amazon.com",} 3.0 +# HELP DiskAvailable Torchserve prometheus gauge metric with unit: Gigabytes +# TYPE DiskAvailable gauge +DiskAvailable{Level="Host",Hostname="88665a372f4b.ant.amazon.com",} 308.94310760498047 +# HELP ts_inference_requests_total Torchserve prometheus counter metric with unit: Count +# TYPE ts_inference_requests_total counter +ts_inference_requests_total{model_name="mnist",model_version="default",hostname="88665a372f4b.ant.amazon.com",} 1.0 +# HELP GPUMemoryUtilization Torchserve prometheus gauge metric with unit: Percent +# TYPE GPUMemoryUtilization gauge +# HELP HandlerTime Torchserve prometheus gauge metric with unit: ms +# TYPE HandlerTime gauge +HandlerTime{ModelName="mnist",Level="Model",Hostname="88665a372f4b.ant.amazon.com",} 62.64 +# HELP ts_inference_latency_microseconds Torchserve prometheus counter metric with unit: Microseconds +# TYPE ts_inference_latency_microseconds counter +ts_inference_latency_microseconds{model_name="mnist",model_version="default",hostname="88665a372f4b.ant.amazon.com",} 64694.367 +# HELP MemoryUtilization Torchserve prometheus gauge metric with unit: Percent +# TYPE MemoryUtilization gauge +MemoryUtilization{Level="Host",Hostname="88665a372f4b.ant.amazon.com",} 53.1 +# HELP MemoryAvailable Torchserve prometheus gauge metric with unit: Megabytes +# TYPE MemoryAvailable gauge +MemoryAvailable{Level="Host",Hostname="88665a372f4b.ant.amazon.com",} 7677.29296875 +# HELP PostprocessCallCount Torchserve prometheus counter metric with unit: count +# TYPE PostprocessCallCount counter +PostprocessCallCount{ModelName="mnist",Level="Model",Hostname="88665a372f4b.ant.amazon.com",} 1.0 +# HELP ExamplePercentMetric Torchserve prometheus histogram metric with unit: percent +# TYPE ExamplePercentMetric histogram +ExamplePercentMetric_bucket{ModelName="mnist",Level="Model",Hostname="88665a372f4b.ant.amazon.com",le="0.005",} 0.0 
+ExamplePercentMetric_bucket{ModelName="mnist",Level="Model",Hostname="88665a372f4b.ant.amazon.com",le="0.01",} 0.0 +ExamplePercentMetric_bucket{ModelName="mnist",Level="Model",Hostname="88665a372f4b.ant.amazon.com",le="0.025",} 0.0 +ExamplePercentMetric_bucket{ModelName="mnist",Level="Model",Hostname="88665a372f4b.ant.amazon.com",le="0.05",} 0.0 +ExamplePercentMetric_bucket{ModelName="mnist",Level="Model",Hostname="88665a372f4b.ant.amazon.com",le="0.075",} 0.0 +ExamplePercentMetric_bucket{ModelName="mnist",Level="Model",Hostname="88665a372f4b.ant.amazon.com",le="0.1",} 0.0 +ExamplePercentMetric_bucket{ModelName="mnist",Level="Model",Hostname="88665a372f4b.ant.amazon.com",le="0.25",} 0.0 +ExamplePercentMetric_bucket{ModelName="mnist",Level="Model",Hostname="88665a372f4b.ant.amazon.com",le="0.5",} 0.0 +ExamplePercentMetric_bucket{ModelName="mnist",Level="Model",Hostname="88665a372f4b.ant.amazon.com",le="0.75",} 0.0 +ExamplePercentMetric_bucket{ModelName="mnist",Level="Model",Hostname="88665a372f4b.ant.amazon.com",le="1.0",} 0.0 +ExamplePercentMetric_bucket{ModelName="mnist",Level="Model",Hostname="88665a372f4b.ant.amazon.com",le="2.5",} 0.0 +ExamplePercentMetric_bucket{ModelName="mnist",Level="Model",Hostname="88665a372f4b.ant.amazon.com",le="5.0",} 0.0 +ExamplePercentMetric_bucket{ModelName="mnist",Level="Model",Hostname="88665a372f4b.ant.amazon.com",le="7.5",} 0.0 +ExamplePercentMetric_bucket{ModelName="mnist",Level="Model",Hostname="88665a372f4b.ant.amazon.com",le="10.0",} 0.0 +ExamplePercentMetric_bucket{ModelName="mnist",Level="Model",Hostname="88665a372f4b.ant.amazon.com",le="+Inf",} 1.0 +ExamplePercentMetric_count{ModelName="mnist",Level="Model",Hostname="88665a372f4b.ant.amazon.com",} 1.0 +ExamplePercentMetric_sum{ModelName="mnist",Level="Model",Hostname="88665a372f4b.ant.amazon.com",} 50.0 +# HELP GPUUtilization Torchserve prometheus gauge metric with unit: Percent +# TYPE GPUUtilization gauge +# HELP MemoryUsed Torchserve prometheus gauge metric with unit: Megabytes +# TYPE MemoryUsed gauge +MemoryUsed{Level="Host",Hostname="88665a372f4b.ant.amazon.com",} 7903.734375 +# HELP QueueTime Torchserve prometheus gauge metric with unit: Milliseconds +# TYPE QueueTime gauge +QueueTime{Level="Host",Hostname="88665a372f4b.ant.amazon.com",} 0.0 +# HELP ts_queue_latency_microseconds Torchserve prometheus counter metric with unit: Microseconds +# TYPE ts_queue_latency_microseconds counter +ts_queue_latency_microseconds{model_name="mnist",model_version="default",hostname="88665a372f4b.ant.amazon.com",} 115.79 +# HELP PreprocessCallCount Torchserve prometheus counter metric with unit: count +# TYPE PreprocessCallCount counter +PreprocessCallCount{ModelName="mnist",Hostname="88665a372f4b.ant.amazon.com",} 1.0 +# HELP RequestBatchSize Torchserve prometheus gauge metric with unit: count +# TYPE RequestBatchSize gauge +RequestBatchSize{ModelName="mnist",Hostname="88665a372f4b.ant.amazon.com",} 1.0 +# HELP SizeOfImage Torchserve prometheus gauge metric with unit: kB +# TYPE SizeOfImage gauge +SizeOfImage{ModelName="mnist",Level="Model",Hostname="88665a372f4b.ant.amazon.com",} 0.265625 +# HELP Requests4XX Torchserve prometheus counter metric with unit: Count +# TYPE Requests4XX counter +# HELP HandlerMethodTime Torchserve prometheus gauge metric with unit: ms +# TYPE HandlerMethodTime gauge +HandlerMethodTime{MethodName="preprocess",ModelName="mnist",Level="Model",Hostname="88665a372f4b.ant.amazon.com",} 25.554895401000977 +# HELP InitializeCallCount Torchserve prometheus counter metric with unit: 
count +# TYPE InitializeCallCount counter +InitializeCallCount{ModelName="mnist",Level="Model",Hostname="88665a372f4b.ant.amazon.com",} 1.0 +# HELP DiskUtilization Torchserve prometheus gauge metric with unit: Percent +# TYPE DiskUtilization gauge +DiskUtilization{Level="Host",Hostname="88665a372f4b.ant.amazon.com",} 2.7 +# HELP GPUMemoryUsed Torchserve prometheus gauge metric with unit: Megabytes +# TYPE GPUMemoryUsed gauge +# HELP InferenceRequestCount Torchserve prometheus counter metric with unit: count +# TYPE InferenceRequestCount counter +InferenceRequestCount{Hostname="88665a372f4b.ant.amazon.com",} 1.0 +``` + +- Step 8: Navigate to `http://localhost:9090/` on a browser to execute queries and create graphs + +Screenshot 2023-08-03 at 6 46 47 PM diff --git a/examples/custom_metrics/config.properties b/examples/custom_metrics/config.properties new file mode 100644 index 0000000000..02607ac36d --- /dev/null +++ b/examples/custom_metrics/config.properties @@ -0,0 +1,12 @@ +metrics_mode=prometheus +metrics_config=examples/custom_metrics/metrics.yaml +models={\ + "mnist": {\ + "1.0": {\ + "defaultVersion": true,\ + "marName": "mnist.mar",\ + "minWorkers": 1,\ + "maxWorkers": 1\ + }\ + }\ +} diff --git a/examples/custom_metrics/metrics.yaml b/examples/custom_metrics/metrics.yaml new file mode 100644 index 0000000000..a4f31fdfe3 --- /dev/null +++ b/examples/custom_metrics/metrics.yaml @@ -0,0 +1,103 @@ +dimensions: + - &model_name "ModelName" + - &worker_name "WorkerName" + - &level "Level" + - &device_id "DeviceId" + - &hostname "Hostname" + +ts_metrics: + counter: + - name: Requests2XX + unit: Count + dimensions: [*level, *hostname] + - name: Requests4XX + unit: Count + dimensions: [*level, *hostname] + - name: Requests5XX + unit: Count + dimensions: [*level, *hostname] + - name: ts_inference_requests_total + unit: Count + dimensions: ["model_name", "model_version", "hostname"] + - name: ts_inference_latency_microseconds + unit: Microseconds + dimensions: ["model_name", "model_version", "hostname"] + - name: ts_queue_latency_microseconds + unit: Microseconds + dimensions: ["model_name", "model_version", "hostname"] + gauge: + - name: QueueTime + unit: Milliseconds + dimensions: [*level, *hostname] + - name: WorkerThreadTime + unit: Milliseconds + dimensions: [*level, *hostname] + - name: WorkerLoadTime + unit: Milliseconds + dimensions: [*worker_name, *level, *hostname] + - name: CPUUtilization + unit: Percent + dimensions: [*level, *hostname] + - name: MemoryUsed + unit: Megabytes + dimensions: [*level, *hostname] + - name: MemoryAvailable + unit: Megabytes + dimensions: [*level, *hostname] + - name: MemoryUtilization + unit: Percent + dimensions: [*level, *hostname] + - name: DiskUsage + unit: Gigabytes + dimensions: [*level, *hostname] + - name: DiskUtilization + unit: Percent + dimensions: [*level, *hostname] + - name: DiskAvailable + unit: Gigabytes + dimensions: [*level, *hostname] + - name: GPUMemoryUtilization + unit: Percent + dimensions: [*level, *device_id, *hostname] + - name: GPUMemoryUsed + unit: Megabytes + dimensions: [*level, *device_id, *hostname] + - name: GPUUtilization + unit: Percent + dimensions: [*level, *device_id, *hostname] + +model_metrics: + # Dimension "Hostname" is automatically added for model metrics in the backend + counter: + - name: InferenceRequestCount + unit: count + dimensions: [] + - name: InitializeCallCount + unit: count + dimensions: [*model_name, *level] + - name: PreprocessCallCount + unit: count + dimensions: [*model_name] + - name: 
PostprocessCallCount + unit: count + dimensions: [*model_name, *level] + gauge: + - name: HandlerTime + unit: ms + dimensions: [*model_name, *level] + - name: PredictionTime + unit: ms + dimensions: [*model_name, *level] + - name: RequestBatchSize + unit: count + dimensions: ["ModelName"] + - name: SizeOfImage + unit: kB + dimensions: [*model_name, *level] + - name: HandlerMethodTime + unit: ms + dimensions: ["MethodName", *model_name, *level] + histogram: + - name: ExamplePercentMetric + unit: percent + dimensions: [*model_name, *level] diff --git a/examples/custom_metrics/mnist_handler.py b/examples/custom_metrics/mnist_handler.py index 919b3a8f83..632afd5a82 100644 --- a/examples/custom_metrics/mnist_handler.py +++ b/examples/custom_metrics/mnist_handler.py @@ -1,6 +1,9 @@ -import io -from PIL import Image +import time + from torchvision import transforms + +from ts.metrics.dimension import Dimension +from ts.metrics.metric_type_enum import MetricTypes from ts.torch_handler.image_classifier import ImageClassifier @@ -8,13 +11,42 @@ class MNISTDigitClassifier(ImageClassifier): """ MNISTDigitClassifier handler class. This handler extends class ImageClassifier from image_classifier.py, a default handler. This handler takes an image and returns the number in that image. - - Here method postprocess() has been overridden while others are reused from parent class. """ image_processing = transforms.Compose( - [transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,))]) + [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] + ) + + def initialize(self, context): + super().initialize(context) + metrics = context.metrics + + # "add_metric_to_cache" will only register/override(if already present) a metric object in the metric cache and will not emit it + self.inf_request_count = metrics.add_metric_to_cache( + metric_name="InferenceRequestCount", + unit="count", + dimension_names=[], + metric_type=MetricTypes.COUNTER, + ) + metrics.add_metric_to_cache( + metric_name="PreprocessCallCount", + unit="count", + dimension_names=["ModelName"], + metric_type=MetricTypes.COUNTER, + ) + + # "add_metric" will register the metric if not already present in metric cache, + # include the "ModelName" and "Level" dimensions by default and emit it + metrics.add_metric( + name="InitializeCallCount", + value=1, + unit="count", + dimensions=[ + Dimension(name="ModelName", value=context.model_name), + Dimension(name="Level", value="Model"), + ], + metric_type=MetricTypes.COUNTER, + ) def preprocess(self, data): """ @@ -27,10 +59,52 @@ def preprocess(self, data): Returns: tensor: Returns the tensor data of the input """ + preprocess_start = time.time() + metrics = self.context.metrics - input = data[0].get('body') - metrics.add_size('SizeOfImage', len(input) / 1024, None, 'kB') - return ImageClassifier.preprocess(self, data) + + # "add_or_update" will emit the metric + self.inf_request_count.add_or_update(value=1, dimension_values=[]) + + # "get_metric" will fetch the corresponding metric from metric cache if present + preprocess_call_count_metric = metrics.get_metric( + metric_name="PreprocessCallCount", metric_type=MetricTypes.COUNTER + ) + preprocess_call_count_metric.add_or_update( + value=1, dimension_values=[self.context.model_name] + ) + + request_batch_size_metric = metrics.get_metric( + metric_name="RequestBatchSize", metric_type=MetricTypes.GAUGE + ) + request_batch_size_metric.add_or_update( + value=len(data), dimension_values=[self.context.model_name] + ) + + input = 
data[0].get("body") + + # "add_size" will register the metric if not already present in metric cache, + # include the "ModelName" and "Level" dimensions by default and emit it + metrics.add_size( + name="SizeOfImage", value=len(input) / 1024, idx=None, unit="kB" + ) + + preprocessed_image = ImageClassifier.preprocess(self, data) + + preprocess_stop = time.time() + + # "add_time" will register the metric if not already present in metric cache, + # include the "ModelName" and "Level" dimensions by default and emit it + metrics.add_time( + name="HandlerMethodTime", + value=(preprocess_stop - preprocess_start) * 1000, + idx=None, + unit="ms", + dimensions=[Dimension(name="MethodName", value="preprocess")], + metric_type=MetricTypes.GAUGE, + ) + + return preprocessed_image def postprocess(self, data): """The post process of MNIST converts the predicted output response to a label. @@ -41,4 +115,19 @@ def postprocess(self, data): Returns: list : A list of dictionary with predictons and explanations are returned. """ + # "add_counter" will register the metric if not already present in metric cache, + # include the "ModelName" and "Level" dimensions by default and emit it + self.context.metrics.add_counter( + name="PostprocessCallCount", value=1, idx=None, dimensions=[] + ) + # "add_percent" will register the metric if not already present in metric cache, + # include the "ModelName" and "Level" dimensions by default and emit it + self.context.metrics.add_percent( + name="ExamplePercentMetric", + value=50, + idx=None, + dimensions=[], + metric_type=MetricTypes.HISTOGRAM, + ) + return data.argmax(1).flatten().tolist() diff --git a/examples/custom_metrics/torchserve_custom.mtail b/examples/custom_metrics/torchserve_custom.mtail deleted file mode 100644 index 15e642d762..0000000000 --- a/examples/custom_metrics/torchserve_custom.mtail +++ /dev/null @@ -1,24 +0,0 @@ -counter request_count -gauge image_size -gauge model_name -gauge level -gauge host_name -gauge request_id -gauge time_stamp - -# Sample log -# 2021-08-27 21:15:03,376 - PredictionTime.Milliseconds:109.74|#ModelName:bert,Level:Model|#hostname:ubuntu-ThinkPad-E14,requestID:09ed6c2c-9380-480d-a61a-66bfea958c1d,timestamp:1630079103 -# 2021-08-27 21:15:03,376 - HandlerTime.Milliseconds:109.74|#ModelName:bert,Level:Model|#hostname:ubuntu-ThinkPad-E14,requestID:09ed6c2c-9380-480d-a61a-66bfea958c1d,timestamp:1630079103 -# 2021-09-02 00:24:34,001 - InferenceTime.Milliseconds:3.05|#ModelName:mnist,Level:Model|#hostname:ubuntu-ThinkPad-E14,requestID:ce9a3631-e509-4a82-91c4-482cd2a15cd9,timestamp:1630522474 - -const PATTERN /SizeOfImage\.Kilobytes:(\d+\.\d+)\|#ModelName:([a-zA-Z]+),Level:([a-zA-Z]+)\|#hostname:([a-zA-Z0-9-]+),requestID:([a-zA-Z0-9-]+),timestamp:([0-9]+)/ - -PATTERN{ - request_count++ - image_size = $1 - model_name = $2 - level = $3 - host_name = $4 - request_id = $5 - time_stamp = $6 -} diff --git a/ts/metrics/metric_cache_yaml_impl.py b/ts/metrics/metric_cache_yaml_impl.py index 7206c83c30..fa170dd816 100644 --- a/ts/metrics/metric_cache_yaml_impl.py +++ b/ts/metrics/metric_cache_yaml_impl.py @@ -109,7 +109,7 @@ def add_metric_to_cache( metric_type: MetricTypes = MetricTypes.COUNTER, ) -> CachingMetric: """ - Create a new metric and add into cache + Create a new metric and add into cache. Override existing metric with same name if present. 
Parameters ---------- diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt index 902439747a..7618579767 100644 --- a/ts_scripts/spellcheck_conf/wordlist.txt +++ b/ts_scripts/spellcheck_conf/wordlist.txt @@ -1068,3 +1068,10 @@ chatGPT baseimage cuDNN Xformer +ExamplePercentMetric +HandlerMethodTime +InferenceRequestCount +PostprocessCallCount +RequestBatchSize +InitializeCallCount +PreprocessCallCount From 39e715ddce75ef469f1f701f4122b9277b63692e Mon Sep 17 00:00:00 2001 From: Jagadeesh J Date: Fri, 25 Aug 2023 00:38:30 +0530 Subject: [PATCH 05/11] feat: add KServe gRPC v2 support (#2176) * feat: add KServe gRPC v2 support Signed-off-by: jagadeesh * feat: add utils to convert kserve pb to ts pb Signed-off-by: jagadeesh * add ts pb to kserve pb conversion method Signed-off-by: jagadeesh * Add pb python file generation step at docker build Signed-off-by: jagadeesh * fix: readme doc - add logs Signed-off-by: jagadeesh * update readme Signed-off-by: jagadeesh * fix lint errors * fix kserve_v2 service envelop and test data Signed-off-by: jagadeesh * re-test Signed-off-by: jagadeesh * re-test Signed-off-by: jagadeesh --------- Signed-off-by: jagadeesh Co-authored-by: Geeta Chauhan <4461127+chauhang@users.noreply.github.com> Co-authored-by: Ankith Gunapal Co-authored-by: Mark Saroufim --- kubernetes/kserve/Dockerfile | 15 ++- kubernetes/kserve/Dockerfile.dev | 2 + kubernetes/kserve/README.md | 4 +- kubernetes/kserve/build_image.sh | 9 +- .../v2/mnist/mnist_v2_bytes.json | 3 +- .../v2/mnist/mnist_v2_bytes_grpc.json | 11 +++ .../v2/mnist/mnist_v2_tensor_grpc.json | 12 +++ kubernetes/kserve/kserve_wrapper/README.md | 59 +++++++----- .../kserve_wrapper/TSModelRepository.py | 7 +- .../kserve/kserve_wrapper/TorchserveModel.py | 95 ++++++++++++++++++- kubernetes/kserve/kserve_wrapper/__main__.py | 60 +++++++++--- .../kserve/kserve_wrapper/gprc_utils.py | 75 +++++++++++++++ kubernetes/kserve/requirements.txt | 3 + test/postman/kfv2_inference_data.json | 2 +- ts/torch_handler/request_envelope/kservev2.py | 24 +++-- ts_scripts/spellcheck_conf/wordlist.txt | 3 +- 16 files changed, 326 insertions(+), 58 deletions(-) create mode 100644 kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_bytes_grpc.json create mode 100644 kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_tensor_grpc.json create mode 100644 kubernetes/kserve/kserve_wrapper/gprc_utils.py diff --git a/kubernetes/kserve/Dockerfile b/kubernetes/kserve/Dockerfile index 95ea649a8e..eb32f579bc 100644 --- a/kubernetes/kserve/Dockerfile +++ b/kubernetes/kserve/Dockerfile @@ -1,13 +1,13 @@ # syntax = docker/dockerfile:experimental # # Following comments have been shamelessly copied from https://github.com/pytorch/pytorch/blob/master/Dockerfile -# +# # NOTE: To build this you will need a docker version > 18.06 with # experimental enabled and DOCKER_BUILDKIT=1 # # If you do not use buildkit you are not going to have a good time # -# For reference: +# For reference: # https://docs.docker.com/develop/develop-images/build_enhancements ARG BASE_IMAGE=pytorch/torchserve:latest @@ -24,9 +24,18 @@ RUN pip install -r requirements.txt COPY dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh RUN chmod +x /usr/local/bin/dockerd-entrypoint.sh COPY kserve_wrapper kserve_wrapper + +COPY ./*.proto ./kserve_wrapper/ + +RUN python -m grpc_tools.protoc \ + --proto_path=./kserve_wrapper \ + --python_out=./kserve_wrapper \ + --grpc_python_out=./kserve_wrapper \ + ./kserve_wrapper/inference.proto \ + 
./kserve_wrapper/management.proto + COPY config.properties config.properties USER model-server ENTRYPOINT ["/usr/local/bin/dockerd-entrypoint.sh"] - diff --git a/kubernetes/kserve/Dockerfile.dev b/kubernetes/kserve/Dockerfile.dev index 731a0e8e2b..54af943553 100644 --- a/kubernetes/kserve/Dockerfile.dev +++ b/kubernetes/kserve/Dockerfile.dev @@ -69,6 +69,8 @@ RUN if [ "$MACHINE_TYPE" = "gpu" ]; then export USE_CUDA=1; fi \ && chmod +x /usr/local/bin/dockerd-entrypoint.sh \ && chown -R model-server /home/model-server \ && cp -R kubernetes/kserve/kserve_wrapper /home/model-server/kserve_wrapper \ + && cp frontend/server/src/main/resources/proto/*.proto /home/model-serve/kserve_wrapper \ + && python -m grpc_tools.protoc --proto_path=/home/model-server/kserve_wrapper --python_out=/home/model-server/kserve_wrapper --grpc_python_out=/home/model-server/kserve_wrapper /home/model-server/kserve_wrapper/inference.proto /home/model-server/kserve_wrapper/management.proto \ && cp kubernetes/kserve/config.properties /home/model-server/config.properties \ && mkdir /home/model-server/model-store && chown -R model-server /home/model-server/model-store diff --git a/kubernetes/kserve/README.md b/kubernetes/kserve/README.md index c35cd2cabf..cf54a6ce73 100644 --- a/kubernetes/kserve/README.md +++ b/kubernetes/kserve/README.md @@ -30,10 +30,10 @@ Currently, KServe supports the Inference API for all the existing models but tex ./build_image.sh -g -t /: ``` -### Docker Image Dev Build +- To create dev image ```bash -DOCKER_BUILDKIT=1 docker build -f Dockerfile.dev -t pytorch/torchserve-kfs:latest-dev . +./build_image.sh -g -d -t /: ``` ## Running Torchserve inference service in KServe cluster diff --git a/kubernetes/kserve/build_image.sh b/kubernetes/kserve/build_image.sh index ea7b587327..2f17596ee5 100755 --- a/kubernetes/kserve/build_image.sh +++ b/kubernetes/kserve/build_image.sh @@ -2,6 +2,7 @@ DOCKER_TAG="pytorch/torchserve-kfs:latest" BASE_IMAGE="pytorch/torchserve:latest" +DOCKER_FILE="Dockerfile" for arg in "$@" do @@ -18,6 +19,10 @@ do BASE_IMAGE="pytorch/torchserve:latest-gpu" shift ;; + -d|--dev) + DOCKER_FILE="Dockerfile.dev" + shift + ;; -t|--tag) DOCKER_TAG="$2" shift @@ -26,4 +31,6 @@ do esac done -DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE=$BASE_IMAGE -t "$DOCKER_TAG" . +cp ../../frontend/server/src/main/resources/proto/*.proto . + +DOCKER_BUILDKIT=1 docker build --file "$DOCKER_FILE" --build-arg BASE_IMAGE=$BASE_IMAGE -t "$DOCKER_TAG" . 
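The v2 request files below follow the KServe open inference protocol. As a rough illustration, a payload in the shape of `mnist_v2_bytes.json` can be produced with a short script like the sketch below (the input image path and the generated `id`/`name` values are illustrative):

```python
import base64
import json
import uuid

# Sketch: base64-encode a test image and wrap it in a KServe v2 "BYTES" request.
with open("examples/image_classifier/mnist/test_data/0.png", "rb") as image_file:
    encoded_image = base64.b64encode(image_file.read()).decode("utf-8")

payload = {
    "id": str(uuid.uuid4()),
    "inputs": [
        {
            "name": str(uuid.uuid4()),
            "shape": [-1],
            "datatype": "BYTES",
            "data": [encoded_image],
        }
    ],
}

with open("mnist_v2_bytes.json", "w") as request_file:
    json.dump(payload, request_file)
```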
diff --git a/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_bytes.json b/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_bytes.json index 683ada7b73..096c555598 100644 --- a/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_bytes.json +++ b/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_bytes.json @@ -1,9 +1,10 @@ { + "id": "d3b15cad-50a2-4eaf-80ce-8b0a428bd298", "inputs": [ { "data": ["iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAAAAABXZoBIAAAA10lEQVR4nGNgGFhgy6xVdrCszBaLFN/mr28+/QOCr69DMCSnA8WvHti0acu/fx/10OS0X/975CDDw8DA1PDn/1pBVEmLf3+zocy2X/+8USXt/82Ds+/+m4sqeehfOpw97d9VFDmlO++t4JwQNMm6f6sZcEpee2+DR/I4A05J7tt4JJP+IUsu+ncRp6TxO9RAQJY0XvrvMAuypNNHuCTz8n+PzVEcy3DtqgiY1ptx6t8/ewY0yX9ntoDA63//Xs3hQpMMPPsPAv68qmDAAFKXwHIzMzCl6AoAxXp0QujtP+8AAAAASUVORK5CYII="], "datatype": "BYTES", - "name": "e8d5afed-0a56-4deb-ac9c-352663f51b93", + "name": "312a4eb0-0ca7-4803-a101-a6d2c18486fe", "shape": [-1] } ] diff --git a/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_bytes_grpc.json b/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_bytes_grpc.json new file mode 100644 index 0000000000..44e25e9fc9 --- /dev/null +++ b/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_bytes_grpc.json @@ -0,0 +1,11 @@ +{ + "model_name": "mnist", + "inputs": [{ + "name": "312a4eb0-0ca7-4803-a101-a6d2c18486fe", + "shape": [-1], + "datatype": "BYTES", + "contents": { + "bytes_contents": ["iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAAAAABXZoBIAAAA10lEQVR4nGNgGFhgy6xVdrCszBaLFN/mr28+/QOCr69DMCSnA8WvHti0acu/fx/10OS0X/975CDDw8DA1PDn/1pBVEmLf3+zocy2X/+8USXt/82Ds+/+m4sqeehfOpw97d9VFDmlO++t4JwQNMm6f6sZcEpee2+DR/I4A05J7tt4JJP+IUsu+ncRp6TxO9RAQJY0XvrvMAuypNNHuCTz8n+PzVEcy3DtqgiY1ptx6t8/ewY0yX9ntoDA63//Xs3hQpMMPPsPAv68qmDAAFKXwHIzMzCl6AoAxXp0QujtP+8AAAAASUVORK5CYII="] + } + }] +} diff --git a/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_tensor_grpc.json b/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_tensor_grpc.json new file mode 100644 index 0000000000..3fd601005e --- /dev/null +++ b/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_tensor_grpc.json @@ -0,0 +1,12 @@ +{ + "id": "d3b15cad-50a2-4eaf-80ce-8b0a428bd298", + "model_name": "mnist", + "inputs": [{ + "name": "input-0", + "shape": [1, 28, 28], + "datatype": "FP32", + "contents": { + "fp32_contents": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.23919999599456787, 0.011800000444054604, 0.1647000014781952, 0.4627000093460083, 0.7569000124931335, 0.4627000093460083, 0.4627000093460083, 0.23919999599456787, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05490000173449516, 0.7020000219345093, 0.9607999920845032, 0.9254999756813049, 0.9490000009536743, 0.9961000084877014, 0.9961000084877014, 0.9961000084877014, 0.9961000084877014, 0.9607999920845032, 0.9215999841690063, 0.3294000029563904, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.592199981212616, 0.9961000084877014, 0.9961000084877014, 0.9961000084877014, 
0.8353000283241272, 0.7529000043869019, 0.6980000138282776, 0.6980000138282776, 0.7059000134468079, 0.9961000084877014, 0.9961000084877014, 0.9451000094413757, 0.18039999902248383, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16859999299049377, 0.9215999841690063, 0.9961000084877014, 0.8863000273704529, 0.25099998712539673, 0.10980000346899033, 0.0471000000834465, 0.0, 0.0, 0.007799999788403511, 0.5019999742507935, 0.9882000088691711, 1.0, 0.6783999800682068, 0.06669999659061432, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.21960000693798065, 0.9961000084877014, 0.9922000169754028, 0.4196000099182129, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5254999995231628, 0.980400025844574, 0.9961000084877014, 0.29409998655319214, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.24709999561309814, 0.9961000084877014, 0.6195999979972839, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8666999936103821, 0.9961000084877014, 0.6157000064849854, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7608000040054321, 0.9961000084877014, 0.40389999747276306, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5881999731063843, 0.9961000084877014, 0.8353000283241272, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.13330000638961792, 0.8626999855041504, 0.9373000264167786, 0.22750000655651093, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3294000029563904, 0.9961000084877014, 0.8353000283241272, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.49410000443458557, 0.9961000084877014, 0.6705999970436096, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3294000029563904, 0.9961000084877014, 0.8353000283241272, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8392000198364258, 0.9373000264167786, 0.2353000044822693, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3294000029563904, 0.9961000084877014, 0.8353000283241272, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8392000198364258, 0.7803999781608582, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3294000029563904, 0.9961000084877014, 0.8353000283241272, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.04309999942779541, 0.8587999939918518, 0.7803999781608582, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3294000029563904, 0.9961000084877014, 0.8353000283241272, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.38429999351501465, 0.9961000084877014, 0.7803999781608582, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6352999806404114, 0.9961000084877014, 0.819599986076355, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.38429999351501465, 0.9961000084877014, 0.7803999781608582, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.20000000298023224, 0.9333000183105469, 0.9961000084877014, 0.29409998655319214, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.38429999351501465, 0.9961000084877014, 0.7803999781608582, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.20000000298023224, 0.6470999717712402, 0.9961000084877014, 0.7646999955177307, 0.015699999406933784, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2587999999523163, 0.9451000094413757, 0.7803999781608582, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.011800000444054604, 0.6549000144004822, 0.9961000084877014, 0.8902000188827515, 0.21570000052452087, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8392000198364258, 0.8353000283241272, 0.07840000092983246, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.18039999902248383, 0.5960999727249146, 0.7922000288963318, 0.9961000084877014, 0.9961000084877014, 0.24709999561309814, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8392000198364258, 0.9961000084877014, 0.800000011920929, 0.7059000134468079, 0.7059000134468079, 0.7059000134468079, 0.7059000134468079, 0.7059000134468079, 0.9215999841690063, 0.9961000084877014, 0.9961000084877014, 0.9175999760627747, 0.6118000149726868, 0.03920000046491623, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3176000118255615, 0.8039000034332275, 0.9961000084877014, 0.9961000084877014, 0.9961000084877014, 0.9961000084877014, 0.9961000084877014, 0.9961000084877014, 0.9961000084877014, 0.9882000088691711, 0.9175999760627747, 0.4706000089645386, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.10199999809265137, 0.8234999775886536, 0.9961000084877014, 0.9961000084877014, 0.9961000084877014, 0.9961000084877014, 0.9961000084877014, 0.6000000238418579, 0.40779998898506165, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] + } + }] +} diff --git a/kubernetes/kserve/kserve_wrapper/README.md b/kubernetes/kserve/kserve_wrapper/README.md index 54837b945d..dc86ca95f5 100644 --- a/kubernetes/kserve/kserve_wrapper/README.md +++ b/kubernetes/kserve/kserve_wrapper/README.md @@ -26,7 +26,7 @@ Follow the below steps to serve the MNIST Model : - Step 2 : Install KServe as below: ```bash -pip install kserve>=0.9.0 +pip install kserve>=0.9.0 grpcio protobuf grpcio-tools ``` - Step 4 : Run the Install Dependencies script @@ -59,11 +59,11 @@ sudo mkdir -p /mnt/models/model-store For v1 protocol -``export TS_SERVICE_ENVELOPE=kserve` +`export TS_SERVICE_ENVELOPE=kserve` For v2 protocol -``export TS_SERVICE_ENVELOPE=kservev2` +`export TS_SERVICE_ENVELOPE=kservev2` - Step 10: Move the config.properties to /mnt/models/config/. 
The config.properties file is as below : @@ -93,6 +93,26 @@ torchserve --start --ts-config /mnt/models/config/config.properties - Step 12: Run the below command to start the KFServer +- Step 13: Set protocol version + +For v1 protocol + +`export PROTOCOL_VERSION=v1` + +For v2 protocol + +`export PROTOCOL_VERSION=v2` + +For grpc protocol v2 format set + +`export PROTOCOL_VERSION=grpc-v2` + +- Generate python gRPC client stub using the proto files + +```bash +python -m grpc_tools.protoc --proto_path=frontend/server/src/main/resources/proto/ --python_out=ts_scripts --grpc_python_out=ts_scripts frontend/server/src/main/resources/proto/inference.proto frontend/server/src/main/resources/proto/management.proto +``` + ```bash python3 serve/kubernetes/kserve/kserve_wrapper/__main__.py ``` @@ -127,7 +147,7 @@ Output: The curl request for explain is as below: -``` +```bash curl -H "Content-Type: application/json" --data @serve/kubernetes/kserve/kf_request_json/v1/mnist.json http://0.0.0.0:8080/v1/models/mnist:explain ``` @@ -146,7 +166,7 @@ For v2 protocol The curl request for inference is as below: ```bash -curl -H "Content-Type: application/json" --data @serve/kubernetes/kserve/kf_request_json/mnist_v2.json http://0.0.0.0:8080/v2/models/mnist/infer +curl -H "Content-Type: application/json" --data @serve/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_tensor.json http://0.0.0.0:8080/v2/models/mnist/infer ``` Response: @@ -167,29 +187,20 @@ Response: } ``` -The curl request for explain is as below: +For grpc-v2 protocol -``` -curl -H "Content-Type: application/json" --data @serve/kubernetes/kserve/kf_request_json/v1/mnist.json http://0.0.0.0:8080/v2/models/mnist/explain +- Download the proto file + +```bash +curl -O https://github.com/raw/kserve/kserve/master/docs/predict-api/v2/grpc_predict_v2.proto ``` -Response: +- Download [grpcurl](https://github.com/fullstorydev/grpcurl) -```json -{ - "id": "3482b766-0483-40e9-84b0-8ce8d4d1576e", - "model_name": "mnist", - "model_version": "1.0", - "outputs": [{ - "name": "explain", - "shape": [1, 28, 28], - "datatype": "FP64", - "data": [-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, -0.0, -0.0, 0.0, -0.0, 0.0 - ... - ... - ] - }] -} +Make gRPC request + +```bash +grpcurl -vv -plaintext -proto grpc_predict_v2.proto -d @ localhost:8081 inference.GRPCInferenceService.ModelInfer <<< $(cat "serve/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_tensor_grpc.json") ``` ## KServe Wrapper Testing in Local for BERT diff --git a/kubernetes/kserve/kserve_wrapper/TSModelRepository.py b/kubernetes/kserve/kserve_wrapper/TSModelRepository.py index 0093e6cad3..4cb0fe47f5 100644 --- a/kubernetes/kserve/kserve_wrapper/TSModelRepository.py +++ b/kubernetes/kserve/kserve_wrapper/TSModelRepository.py @@ -14,7 +14,12 @@ class TSModelRepository(ModelRepository): as inputs to the TSModel Repository. """ - def __init__(self, inference_address: str, management_address: str, model_dir: str): + def __init__( + self, + inference_address: str, + management_address: str, + model_dir: str, + ): """The Inference Address, Management Address and the Model Directory from the kserve side is initialized here. 
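Note: the steps above introduce a `grpc-v2` protocol version in which the kserve_wrapper forwards KServe v2 requests to TorchServe's gRPC Predictions API (port 7070). As a rough illustration of that path, and not part of the patch itself, the sketch below sends one of the v2 request files from this patch directly to TorchServe using the stubs generated from `inference.proto`. The model name, port, and file path come from the examples above and should be adjusted to your deployment; it also assumes the model was registered with `TS_SERVICE_ENVELOPE=kservev2` so the handler understands the v2 JSON payload.

```python
# Sketch: call TorchServe's gRPC Predictions API directly with a KServe v2-style
# payload, roughly what the kserve_wrapper's grpc-v2 path does internally.
# Assumes inference_pb2 / inference_pb2_grpc were generated as shown above.
import json

import grpc
import inference_pb2
import inference_pb2_grpc

channel = grpc.insecure_channel("localhost:7070")  # default grpc_inference_port
stub = inference_pb2_grpc.InferenceAPIsServiceStub(channel)

with open(
    "serve/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_tensor.json", "rb"
) as f:
    payload = f.read()

request = inference_pb2.PredictionsRequest(
    model_name="mnist",
    input={"data": payload},  # TorchServe expects the raw request bytes under "data"
)
response = stub.Predictions(request)
print(json.loads(response.prediction.decode("utf-8")))
```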
diff --git a/kubernetes/kserve/kserve_wrapper/TorchserveModel.py b/kubernetes/kserve/kserve_wrapper/TorchserveModel.py index b85aa52e01..352ff5ab67 100644 --- a/kubernetes/kserve/kserve_wrapper/TorchserveModel.py +++ b/kubernetes/kserve/kserve_wrapper/TorchserveModel.py @@ -2,20 +2,37 @@ return a KServe side response """ import logging import pathlib +from enum import Enum +from typing import Dict, Union +import grpc +import inference_pb2_grpc import kserve +from gprc_utils import from_ts_grpc, to_ts_grpc +from inference_pb2 import PredictionResponse from kserve.errors import ModelMissingError from kserve.model import Model as Model +from kserve.protocol.grpc.grpc_predict_v2_pb2 import ( + ModelInferRequest, + ModelInferResponse, +) +from kserve.protocol.infer_type import InferRequest, InferResponse from kserve.storage import Storage logging.basicConfig(level=kserve.constants.KSERVE_LOGLEVEL) PREDICTOR_URL_FORMAT = PREDICTOR_V2_URL_FORMAT = "http://{0}/predictions/{1}" -EXPLAINER_URL_FORMAT = EXPLAINER_V2_URL_FORMAT = "http://{0}/explanations/{1}" +EXPLAINER_URL_FORMAT = EXPLAINER_v2_URL_FORMAT = "http://{0}/explanations/{1}" REGISTER_URL_FORMAT = "{0}/models?initial_workers=1&url={1}" UNREGISTER_URL_FORMAT = "{0}/models/{1}" +class PredictorProtocol(Enum): + REST_V1 = "v1" + REST_V2 = "v2" + GRPC_V2 = "grpc-v2" + + class TorchserveModel(Model): """The torchserve side inference and explain end-points requests are handled to return a KServe side response @@ -25,7 +42,15 @@ class TorchserveModel(Model): side predict and explain http requests. """ - def __init__(self, name, inference_address, management_address, model_dir): + def __init__( + self, + name, + inference_address, + management_address, + grpc_inference_address, + protocol, + model_dir, + ): """The Model Name, Inference Address, Management Address and the model directory are specified. @@ -45,10 +70,74 @@ def __init__(self, name, inference_address, management_address, model_dir): self.inference_address = inference_address self.management_address = management_address self.model_dir = model_dir + self.protocol = protocol + + if self.protocol == PredictorProtocol.GRPC_V2.value: + self.predictor_host = grpc_inference_address logging.info("Predict URL set to %s", self.predictor_host) - self.explainer_host = self.predictor_host logging.info("Explain URL set to %s", self.explainer_host) + logging.info("Protocol version is %s", self.protocol) + + def grpc_client(self): + if self._grpc_client_stub is None: + self.channel = grpc.aio.insecure_channel(self.predictor_host) + self.grpc_client_stub = inference_pb2_grpc.InferenceAPIsServiceStub( + self.channel + ) + return self.grpc_client_stub + + async def _grpc_predict( + self, + payload: Union[ModelInferRequest, InferRequest], + headers: Dict[str, str] = None, + ) -> ModelInferResponse: + """Overrides the `_grpc_predict` method in Model class. The predict method calls + the `_grpc_predict` method if the self.protocol is "grpc_v2" + + Args: + request (Dict|InferRequest|ModelInferRequest): The response passed from ``predict`` handler. + + Returns: + Dict: Torchserve grpc response. + """ + payload = to_ts_grpc(payload) + grpc_stub = self.grpc_client() + async_result = await grpc_stub.Predictions(payload) + return async_result + + def postprocess( + self, + response: Union[Dict, InferResponse, ModelInferResponse, PredictionResponse], + headers: Dict[str, str] = None, + ) -> Union[Dict, ModelInferResponse]: + """This method converts the v2 infer response types to gRPC or REST. 
+ For gRPC request it converts InferResponse to gRPC message or directly returns ModelInferResponse from + predictor call or converts TS PredictionResponse to ModelInferResponse. + For REST request it converts ModelInferResponse to Dict or directly returns from predictor call. + + Args: + response (Dict|InferResponse|ModelInferResponse|PredictionResponse): The response passed from ``predict`` handler. + headers (Dict): Request headers. + + Returns: + Dict: post-processed response. + """ + if headers: + if "grpc" in headers.get("user-agent", ""): + if isinstance(response, ModelInferResponse): + return response + elif isinstance(response, InferResponse): + return response.to_grpc() + elif isinstance(response, PredictionResponse): + return from_ts_grpc(response) + if "application/json" in headers.get("content-type", ""): + # If the original request is REST, convert the gRPC predict response to dict + if isinstance(response, ModelInferResponse): + return InferResponse.from_grpc(response).to_rest() + elif isinstance(response, InferResponse): + return response.to_rest() + return response def load(self) -> bool: """This method validates model availabilty in the model directory diff --git a/kubernetes/kserve/kserve_wrapper/__main__.py b/kubernetes/kserve/kserve_wrapper/__main__.py index b31e3df375..f67e6de107 100644 --- a/kubernetes/kserve/kserve_wrapper/__main__.py +++ b/kubernetes/kserve/kserve_wrapper/__main__.py @@ -1,6 +1,7 @@ """ KServe wrapper to handler inference in the kserve_predictor """ import json import logging +import os import kserve from kserve.model_server import ModelServer @@ -10,9 +11,8 @@ logging.basicConfig(level=kserve.constants.KSERVE_LOGLEVEL) DEFAULT_MODEL_NAME = "model" -DEFAULT_INFERENCE_ADDRESS = "http://127.0.0.1:8085" -INFERENCE_PORT = "8085" -DEFAULT_MANAGEMENT_ADDRESS = "http://127.0.0.1:8085" +DEFAULT_INFERENCE_ADDRESS = DEFAULT_MANAGEMENT_ADDRESS = "http://127.0.0.1:8085" +DEFAULT_GRPC_INFERENCE_PORT = "7070" DEFAULT_MODEL_STORE = "/mnt/models/model-store" CONFIG_PATH = "/mnt/models/config/config.properties" @@ -41,57 +41,87 @@ def parse_config(): keys[name.strip()] = value.strip() keys["model_snapshot"] = json.loads(keys["model_snapshot"]) - inference_address, management_address, model_store = ( + inference_address, management_address, grpc_inference_port, model_store = ( keys["inference_address"], keys["management_address"], + keys["grpc_inference_port"], keys["model_store"], ) models = keys["model_snapshot"]["models"] model_names = [] - # constructs inf address at a port other than 8080 as kfserver runs at 8080 - if inference_address: - inf_splits = inference_address.split(":") - inference_address = inf_splits[0] + inf_splits[1] + ":" + INFERENCE_PORT - else: - inference_address = DEFAULT_INFERENCE_ADDRESS # Get all the model_names for model, value in models.items(): model_names.append(model) + + if not inference_address: + inference_address = DEFAULT_INFERENCE_ADDRESS if not model_names: model_names = [DEFAULT_MODEL_NAME] + if not inference_address: + inference_address = DEFAULT_INFERENCE_ADDRESS if not management_address: management_address = DEFAULT_MANAGEMENT_ADDRESS + inf_splits = inference_address.split(":") + if not grpc_inference_port: + grpc_inference_address = inf_splits[1] + ":" + DEFAULT_GRPC_INFERENCE_PORT + else: + grpc_inference_address = inf_splits[1] + ":" + grpc_inference_port + grpc_inference_address = grpc_inference_address.replace("/", "") if not model_store: model_store = DEFAULT_MODEL_STORE + logging.info( - "Wrapper : Model names %s, 
inference address %s, management address %s, model store %s", + "Wrapper : Model names %s, inference address %s, management address %s, grpc_inference_address, %s, model store %s", model_names, inference_address, management_address, + grpc_inference_address, model_store, ) - return model_names, inference_address, management_address, model_store + return ( + model_names, + inference_address, + management_address, + grpc_inference_address, + model_store, + ) if __name__ == "__main__": - model_names, inference_address, management_address, model_dir = parse_config() + ( + model_names, + inference_address, + management_address, + grpc_inference_address, + model_dir, + ) = parse_config() + + protocol = os.environ.get("PROTOCOL_VERSION") models = [] for model_name in model_names: model = TorchserveModel( - model_name, inference_address, management_address, model_dir + model_name, + inference_address, + management_address, + grpc_inference_address, + protocol, + model_dir, ) # By default model.load() is called on first request. Enabling load all # model in TS config.properties, all models are loaded at start and the # below method sets status to true for the models. model.load() models.append(model) + registeredModels = TSModelRepository( - inference_address, management_address, model_dir + inference_address, + management_address, + model_dir, ) ModelServer( registered_models=registeredModels, diff --git a/kubernetes/kserve/kserve_wrapper/gprc_utils.py b/kubernetes/kserve/kserve_wrapper/gprc_utils.py new file mode 100644 index 0000000000..c2693d3bc7 --- /dev/null +++ b/kubernetes/kserve/kserve_wrapper/gprc_utils.py @@ -0,0 +1,75 @@ +import base64 +import json +from typing import Union + +from inference_pb2 import PredictionResponse, PredictionsRequest +from kserve.errors import InvalidInput +from kserve.protocol.grpc.grpc_predict_v2_pb2 import ( + InferTensorContents, + ModelInferRequest, +) +from kserve.protocol.infer_type import InferOutput, InferRequest, InferResponse + + +def get_content(datatype: str, data: InferTensorContents): + if datatype == "BOOL": + return list(data.bool_contents) + elif datatype in ["UINT8", "UINT16", "UINT32"]: + return list(data.uint_contents) + elif datatype == "UINT64": + return list(data.uint64_contents) + elif datatype in ["INT8", "INT16", "INT32"]: + return list(data.int_contents) + elif datatype == "INT64": + return list(data.int64_contents) + elif datatype == "FP32": + return list(data.fp32_contents) + elif datatype == "FP64": + return list(data.fp64_contents) + elif datatype == "BYTES": + return [base64.b64encode(data.bytes_contents[0]).decode("utf-8")] + else: + raise InvalidInput("invalid content type") + + +def to_ts_grpc(data: Union[ModelInferRequest, InferRequest]) -> PredictionsRequest: + """Converts the InferRequest object to Torchserve gRPC PredictionsRequest message""" + if isinstance(data, InferRequest): + data = data.to_grpc() + infer_request = {} + model_name = data.model_name + infer_inputs = [ + dict( + name=input_tensor.name, + shape=list(input_tensor.shape), + datatype=input_tensor.datatype, + data=get_content(input_tensor.datatype, input_tensor.contents), + ) + for input_tensor in data.inputs + ] + infer_request["id"] = data.id + infer_request["inputs"] = infer_inputs + ts_grpc_input = {"data": json.dumps(infer_request).encode("utf-8")} + return PredictionsRequest(model_name=model_name, input=ts_grpc_input) + + +def from_ts_grpc(data: PredictionResponse) -> InferResponse: + """Converts the Torchserve gRPC PredictionResponse object to 
InferResponse message""" + decoded_data = json.loads(data.prediction.decode("utf-8")) + infer_outputs = [ + InferOutput( + name=output["name"], + shape=list(output["shape"]), + datatype=output["datatype"], + data=output["data"], + ) + for output in decoded_data["outputs"] + ] + response_id = decoded_data.get("id") + infer_response = InferResponse( + model_name=decoded_data["model_name"], + response_id=response_id, + infer_outputs=infer_outputs, + from_grpc=True, + ) + return infer_response.to_grpc() diff --git a/kubernetes/kserve/requirements.txt b/kubernetes/kserve/requirements.txt index d38cdf548e..9d1898d469 100644 --- a/kubernetes/kserve/requirements.txt +++ b/kubernetes/kserve/requirements.txt @@ -1,3 +1,6 @@ kserve[storage]>=0.11.0 transformers captum +grpcio +protobuf +grpcio-tools diff --git a/test/postman/kfv2_inference_data.json b/test/postman/kfv2_inference_data.json index e00c715450..2a763876e9 100644 --- a/test/postman/kfv2_inference_data.json +++ b/test/postman/kfv2_inference_data.json @@ -6,7 +6,7 @@ "file": "../kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_tensor.json", "content-type": "application/json", "expected": { - "id":"d3b15cad-50a2-4eaf-80ce-8b0a428bd298","model_name":"mnist","model_version":"1.0","outputs":[{"name":"input-0","shape":[],"datatype":"INT64","data":[1]}] + "id":"d3b15cad-50a2-4eaf-80ce-8b0a428bd298","model_name":"mnist","model_version":"1.0","outputs":[{"name":"input-0","shape":[1],"datatype":"INT64","data":[1]}] }, "expected_explain": { "id":"d3b15cad-50a2-4eaf-80ce-8b0a428bd298","model_name":"mnist","model_version":"1.0","outputs":[{"name":"input-0","shape":[1,28,28],"datatype":"FP64","data":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0040547527881586954,-0.00022612877132135935,-0.00012734132068921815,0.005648369123934234,0.00890478344415316,0.002638536746843638,0.0026802459473054567,-0.002657801646198628,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0002446577521584037,0.0008218454252870746,0.015285916556975589,0.007512832032495784,0.007094984582680408,0.003405668414819342,-0.0020919248349481525,-0.0007800296083653554,0.022995877395463753,0.019004328861537745,-0.0012529557611487667,-0.0014666116853554992,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.005298396299742967,-0.0007901602589731957,0.00390606628994132,0.02317408192562863,0.01723791734244863,0.010867034230381416,0.003001563449593484,0.006224217749113618,0.006120711993702211,0.016736329208148985,0.005674718979287411,0.0043441351074201716,-0.0012328422456581033,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0006867354470939666,0.009772898561731134,-0.003875493029617137,0.0017986933105143274,0.00130754408083684,-0.0024510981201440493,-0.0008806773035242951,0,0,-0.00014277890938077845,-0.009322312923101268,0.020608317831970053,0.0043513950202448085,-0.0007875567959471073,-0.0009075897498983682,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00022247236805959426,-0.0007829029576392685,0.0026663695298724034,0.000973336645392922,0,0,0,0,0,0,0,0.0004323206544010433,0.023657171718451487,0.010694845123018274,-0.0023759529649896504,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.002074797027562978,-0.0023201009712006193,-0.0012899209165390638,0,0,0,0,0,0,0,0,0,0.007629679307476711,0.010448627340902272,0.00025032896574585353,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0003770835815454417,-0.005156369326824804,0.0012477581647151723,0,0,0,0,0,0,0,0,0,-0.00004442522927758585,0.0102480464
78304183,0.0009971132925931643,0,0,0,0,0,0,0,0,0,0,0,0,0.0004501049686186689,-0.001963053861562753,-0.0006664790954766908,0.0020157404181157298,0,0,0,0,0,0,0,0,0,-0.0022144570001665495,0.008361584182210209,0.0031401945811928064,0,0,0,0,0,0,0,0,0,0,0,0,-0.0028943546389954404,-0.0031301382952544582,0.002113252627152244,0,0,0,0,0,0,0,0,0,0,-0.0010321050313140568,0.008905753962245818,0.0028464382842652274,0,0,0,0,0,0,0,0,0,0,0,0,-0.005305289160784239,-0.001927110161077484,0.0012090041616218117,0,0,0,0,0,0,0,0,0,0,-0.0011945155110826835,0.005654443253323257,0.0020132074296893847,0,0,0,0,0,0,0,0,0,0,0,0,-0.0014689358191145255,0.00107434126494373,0,0,0,0,0,0,0,0,0,0,0,-0.0017047979656755515,0.002906605326916773,-0.0007805868832212293,0,0,0,0,0,0,0,0,0,0,0,0.000055417251836277426,0.0014516115955483288,0.0002827699382308426,0,0,0,0,0,0,0,0,0,0,0,-0.0014401406798288333,0.002381249994012627,0.002146825485493657,0,0,0,0,0,0,0,0,0,0,0,0.0011500530011764514,0.00028650115062629793,0.0029798149728837,0,0,0,0,0,0,0,0,0,0,0,-0.0017750294246144378,0.0008339858039134471,-0.0037707389974128264,0,0,0,0,0,0,0,0,0,0,0,-0.0006093176702196316,-0.0004690580448827246,0.0034053215399203448,0,0,0,0,0,0,0,0,0,0,-0.0007450010561445004,0.0012987672807208413,-0.00849924754154327,-0.00006145174356975924,0,0,0,0,0,0,0,0,0,0,0,0.0011809727047705845,-0.0018384766530189604,0.005411106767295053,0,0,0,0,0,0,0,0,0,-0.0021392342405935397,0.0003259162378301207,-0.005276118419877435,-0.001950983939698961,-9.545680860124795e-7,0,0,0,0,0,0,0,0,0,0,0,0.000777240560389088,-0.00015179538793786839,0.006481484638650515,0,0,0,0,0,0,0,0,0.00008098065166629173,-0.0024904261335704243,-0.0020718616274916063,-0.00005341157801587443,-0.00045564727357325394,0,0,0,0,0,0,0,0,0,0,0,0,0,0.002275098238597264,0.0017164058060623701,0.00032213445581197173,0,0,0,0,0,-0.001556028266851665,0.0000910724863950236,0.0008772840524484654,0.000650298006504863,-0.004128780934527031,0.0006030386677594234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0013959957755626813,0.00267915270212672,0.0023995009632858484,-0.0004496094979322396,0.003101832911668704,0.007494535603697501,0.002864118701309854,-0.003052590375330078,0.003420222741405451,0.001492401842506996,-0.0009357391552120744,0.0007856228750089005,-0.00184339736789655,0.00001603187900317098,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0006999018662842894,0.004382251035718981,-0.0035419315151426845,-0.002889674705246964,-0.000487345313107622,-0.006087344960098864,0.0003882250941768635,0.0025336419028892817,-0.004352836272916637,-0.0006079418201851047,-0.003810133084711927,-0.0008284412435870998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0010901530193446261,-0.013135007265412056,0.000473452169279359,0.002050423312678761,-0.00660945214953636,0.00236478632058849,0.004678920566995346,-0.0018122525188342855,0.002137538293354298,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]}] diff --git a/ts/torch_handler/request_envelope/kservev2.py b/ts/torch_handler/request_envelope/kservev2.py index 5a88e9497d..d975c1a946 100644 --- a/ts/torch_handler/request_envelope/kservev2.py +++ b/ts/torch_handler/request_envelope/kservev2.py @@ -99,20 +99,28 @@ def _from_json(self, body_list): """ Extracts the data from the JSON object """ - # If the KF Transformer and Explainer sends in data as bytesarray if isinstance(body_list[0], (bytes, bytearray)): - body_list = 
[json.loads(body.decode()) for body in body_list] + body_list = [json.loads(body.decode("utf8")) for body in body_list] logger.debug("Bytes array is %s", body_list) input_names = [] for index, input in enumerate(body_list[0]["inputs"]): if input["datatype"] == "BYTES": body_list[0]["inputs"][index]["data"] = input["data"][0] + else: + body_list[0]["inputs"][index]["data"] = ( + np.array(input["data"]).reshape(tuple(input["shape"])).tolist() + ) input_names.append(input["name"]) setattr(self.context, "input_names", input_names) logger.debug("Bytes array is %s", body_list) - if body_list[0].get("id") is not None: + id = body_list[0].get("id") + if id and id.strip(): setattr(self.context, "input_request_id", body_list[0]["id"]) + # TODO: Add parameters support + # parameters = body_list[0].get("parameters") + # if parameters: + # setattr(self.context, "input_parameters", body_list[0]["parameters"]) data_list = [inputs_list.get("inputs") for inputs_list in body_list][0] return data_list @@ -143,6 +151,10 @@ def format_output(self, data): delattr(self.context, "input_request_id") else: response["id"] = self.context.get_request_id(0) + # TODO: Add parameters support + # if hasattr(self.context, "input_parameters"): + # response["parameters"] = getattr(self.context, "input_parameters") + # delattr(self.context, "input_parameters") response["model_name"] = self.context.manifest.get("model").get("modelName") response["model_version"] = self.context.manifest.get("model").get( "modelVersion" @@ -166,9 +178,9 @@ def _to_json(self, data, input_name): Constructs JSON object from data """ output_data = {} - data_ndarray = np.array(data) + data_ndarray = np.array(data).flatten() output_data["name"] = input_name - output_data["shape"] = list(data_ndarray.shape) output_data["datatype"] = _to_datatype(data_ndarray.dtype) - output_data["data"] = data_ndarray.flatten().tolist() + output_data["data"] = data_ndarray.tolist() + output_data["shape"] = data_ndarray.flatten().shape return output_data diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt index 7618579767..c1a9c52841 100644 --- a/ts_scripts/spellcheck_conf/wordlist.txt +++ b/ts_scripts/spellcheck_conf/wordlist.txt @@ -1068,10 +1068,11 @@ chatGPT baseimage cuDNN Xformer +grpcurl ExamplePercentMetric HandlerMethodTime InferenceRequestCount PostprocessCallCount RequestBatchSize InitializeCallCount -PreprocessCallCount +PreprocessCallCount \ No newline at end of file From 448aad3a1d304e8b6e30c5998bed13dfb874dc9e Mon Sep 17 00:00:00 2001 From: Ankith Gunapal Date: Thu, 24 Aug 2023 13:28:24 -0700 Subject: [PATCH 06/11] update versions for patch release (#2533) Co-authored-by: Mark Saroufim --- model-archiver/model_archiver/version.txt | 2 +- ts/version.txt | 2 +- workflow-archiver/workflow_archiver/version.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/model-archiver/model_archiver/version.txt b/model-archiver/model_archiver/version.txt index 6f4eebdf6f..100435be13 100644 --- a/model-archiver/model_archiver/version.txt +++ b/model-archiver/model_archiver/version.txt @@ -1 +1 @@ -0.8.1 +0.8.2 diff --git a/ts/version.txt b/ts/version.txt index 6f4eebdf6f..100435be13 100644 --- a/ts/version.txt +++ b/ts/version.txt @@ -1 +1 @@ -0.8.1 +0.8.2 diff --git a/workflow-archiver/workflow_archiver/version.txt b/workflow-archiver/workflow_archiver/version.txt index 1866a362b7..13dead7ebf 100644 --- a/workflow-archiver/workflow_archiver/version.txt +++ b/workflow-archiver/workflow_archiver/version.txt @@ 
-1 +1 @@ -0.2.9 +0.2.10 From d47b14deecf6d8f8723ad764b5cc7c22c79c78bc Mon Sep 17 00:00:00 2001 From: Jagadeesh J Date: Fri, 25 Aug 2023 02:59:16 +0530 Subject: [PATCH 07/11] feat: add session affinity to k8s TS (#2519) * feat: add session affinity to k8s TS Signed-off-by: jagadeesh * fix spell check Signed-off-by: jagadeesh * fix docs Signed-off-by: jagadeesh --------- Signed-off-by: jagadeesh Co-authored-by: Geeta Chauhan <4461127+chauhang@users.noreply.github.com> Co-authored-by: Ankith Gunapal --- .pre-commit-config.yaml | 4 +- kubernetes/Helm/templates/torchserve.yaml | 6 ++- kubernetes/Helm/values.yaml | 4 +- kubernetes/README.md | 46 +++++++++++++++++++++++ kubernetes/destination_rule.yaml | 13 +++++++ kubernetes/gateway.yaml | 14 +++++++ kubernetes/virtual_service.yaml | 36 ++++++++++++++++++ ts_scripts/spellcheck_conf/wordlist.txt | 3 ++ ts_scripts/torchserve_grpc_client.py | 17 +++++---- 9 files changed, 132 insertions(+), 11 deletions(-) create mode 100644 kubernetes/destination_rule.yaml create mode 100644 kubernetes/gateway.yaml create mode 100644 kubernetes/virtual_service.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7af6034e2b..78603bdafb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,12 +25,12 @@ repos: - id: python-no-log-warn - id: python-use-type-annotations - repo: https://github.com/hadialqattan/pycln - rev: v2.1.3 + rev: v2.1.5 hooks: - id: pycln args: [--all] - repo: https://github.com/psf/black - rev: 23.1.0 + rev: 23.7.0 hooks: - id: black additional_dependencies: ['click==8.0.4'] diff --git a/kubernetes/Helm/templates/torchserve.yaml b/kubernetes/Helm/templates/torchserve.yaml index 71cecfb56b..642dca4c36 100644 --- a/kubernetes/Helm/templates/torchserve.yaml +++ b/kubernetes/Helm/templates/torchserve.yaml @@ -20,7 +20,9 @@ spec: - name: metrics port: {{ .Values.torchserve.metrics_port }} targetPort: ts-metrics - type: LoadBalancer + - name: grpc + port: {{ .Values.torchserve.grpc_inference_port }} + targetPort: ts-grpc selector: app: torchserve --- @@ -55,6 +57,8 @@ spec: containerPort: {{ .Values.torchserve.management_port }} - name: ts-metrics containerPort: {{ .Values.torchserve.metrics_port }} + - name: ts-grpc + containerPort: {{ .Values.torchserve.grpc_inference_port }} imagePullPolicy: IfNotPresent volumeMounts: - mountPath: {{ .Values.torchserve.pvd_mount }} diff --git a/kubernetes/Helm/values.yaml b/kubernetes/Helm/values.yaml index fb74a4277c..cd8dbc81ac 100644 --- a/kubernetes/Helm/values.yaml +++ b/kubernetes/Helm/values.yaml @@ -8,13 +8,15 @@ torchserve: management_port: 8081 inference_port: 8080 metrics_port: 8082 + grpc_inference_port: 7070 + pvd_mount: /home/model-server/shared/ n_gpu: 4 n_cpu: 16 memory_limit: 32Gi deployment: - replicas: 1 + replicas: 2 persistentVolume: name: efs-claim diff --git a/kubernetes/README.md b/kubernetes/README.md index 9575499cea..6e5bd6678c 100644 --- a/kubernetes/README.md +++ b/kubernetes/README.md @@ -53,6 +53,7 @@ torchserve: management_port: 8081 inference_port: 8080 metrics_port: 8082 + grpc_inference_port: 7070 pvd_mount: /home/model-server/shared/ n_gpu: 1 n_cpu: 1 @@ -290,6 +291,51 @@ Follow the link for log aggregation with EFK Stack.\ ## Autoscaling [Autoscaling with torchserve metrics](autoscale.md) +## Session Affinity with Multiple Torchserve pods + +### Pre-requisites + + - Follow the instructions above and deploy Torchserve with more than 1 replica to the kubernetes cluster + - Download Istio and add to path as shown 
[here](https://istio.io/latest/docs/setup/getting-started/#download) + - Install Istio with below command + - `istioctl install --set meshConfig.accessLogFile=/dev/stdout` + +### Steps + +Now we have multiple replicas of Torchserve running and istio installed. We can apply gateway, virtual service and destination rule to enable session affinity to the user requests. + + - Apply the istio gateway via `kubectl apply -f gateway.yaml` + - This gateway exposes all the host behind it via port 80 as defined in the yaml file. + - Apply the virtual service with command `kubectl apply -f virtual_service.yaml` + - This with look for header named `protocol` in the incoming request and forward the request to Torchserve service. If the `protocol` header has a value `rest` then the request is forwarded to port `8080` of Torchserve service and if the `protocol` header has a value `grpc` then the request is forwarded to port `7070` for Torchserve service. + - Apply the destination Rule using the command `kubectl apply -f destination_rule.yaml`. + - The destination rule look for a http cookie with a key `session_id`. The request with `session_id` is served by the same pod that served the previous request with the same `session_id` + +### HTTP Inference + +- Fetch the external IP from istio-ingress gateway using the below command + +```bash +ubuntu@ubuntu$ kubectl get svc -n istio-system +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +istio-ingressgateway LoadBalancer 10.100.84.243 a918b2zzzzzzzzzzzzzzzzzzzzzz-1466623565.us-west-2.elb.amazonaws.com 15021:32270/TCP,80:31978/TCP,443:31775/TCP,70:31778/TCP 2d6h +``` + +- Make Request as shown below + +```bash +curl -v -H "protocol: REST" --cookie "session_id="12345" http://a918b2d70dbddzzzzzzzzzzz49ec8cf03b-1466623565.us-west-2.elb.amazonaws.com:80/predictions/ -d "data=" +``` + +### gRPC Inference + +- Refer [grpc_api](../docs/grpc_api.md) to generate python files and run + +```bash +python ts_scripts/torchserve_grpc_client.py infer +``` + + ## Roadmap * [] Log / Metrics Aggregation using [AWS Container Insights](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/ContainerInsights.html) diff --git a/kubernetes/destination_rule.yaml b/kubernetes/destination_rule.yaml new file mode 100644 index 0000000000..b334fa4106 --- /dev/null +++ b/kubernetes/destination_rule.yaml @@ -0,0 +1,13 @@ +apiVersion: networking.istio.io/v1alpha3 +kind: DestinationRule +metadata: + name: torchserve-dr +spec: + host: torchserve.default.svc.cluster.local # ..svc.cluster.local + trafficPolicy: + loadBalancer: + consistentHash: + # httpHeaderName: x-user + httpCookie: + name: session_id + ttl: 60s diff --git a/kubernetes/gateway.yaml b/kubernetes/gateway.yaml new file mode 100644 index 0000000000..b2ecfca23b --- /dev/null +++ b/kubernetes/gateway.yaml @@ -0,0 +1,14 @@ +apiVersion: networking.istio.io/v1beta1 +kind: Gateway +metadata: + name: torchserve-gw +spec: + selector: + istio: ingressgateway + servers: + - hosts: + - '*' + port: + name: http + number: 80 + protocol: HTTP diff --git a/kubernetes/virtual_service.yaml b/kubernetes/virtual_service.yaml new file mode 100644 index 0000000000..889f6c0d22 --- /dev/null +++ b/kubernetes/virtual_service.yaml @@ -0,0 +1,36 @@ +apiVersion: networking.istio.io/v1alpha3 +kind: VirtualService +metadata: + name: torchserve-vs +spec: + hosts: + - "*" + gateways: + - torchserve-gw + http: + - match: + - uri: + prefix: /metrics + route: + - destination: + host: torchserve.default.svc.cluster.local + port: + number: 8082 + - match: + - 
headers: + protocol: + exact: REST + route: + - destination: + host: torchserve.default.svc.cluster.local # ..svc.cluster.local + port: + number: 8080 + - match: + - headers: + protocol: + exact: gRPC + route: + - destination: + host: torchserve.default.svc.cluster.local # ..svc.cluster.local + port: + number: 7070 diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt index c1a9c52841..9f6e6e8aab 100644 --- a/ts_scripts/spellcheck_conf/wordlist.txt +++ b/ts_scripts/spellcheck_conf/wordlist.txt @@ -1065,6 +1065,9 @@ ActionSLAM statins ci chatGPT +accessLogFile +istioctl +meshConfig baseimage cuDNN Xformer diff --git a/ts_scripts/torchserve_grpc_client.py b/ts_scripts/torchserve_grpc_client.py index ccf293ed3f..a1868884c1 100644 --- a/ts_scripts/torchserve_grpc_client.py +++ b/ts_scripts/torchserve_grpc_client.py @@ -19,13 +19,14 @@ def get_management_stub(): return stub -def infer(stub, model_name, model_input): +def infer(stub, model_name, model_input, metadata): with open(model_input, "rb") as f: data = f.read() input_data = {"data": data} response = stub.Predictions( - inference_pb2.PredictionsRequest(model_name=model_name, input=input_data) + inference_pb2.PredictionsRequest(model_name=model_name, input=input_data), + metadata=metadata, ) try: @@ -35,13 +36,14 @@ def infer(stub, model_name, model_input): exit(1) -def infer_stream(stub, model_name, model_input): +def infer_stream(stub, model_name, model_input, metadata): with open(model_input, "rb") as f: data = f.read() input_data = {"data": data} responses = stub.StreamPredictions( - inference_pb2.PredictionsRequest(model_name=model_name, input=input_data) + inference_pb2.PredictionsRequest(model_name=model_name, input=input_data), + metadata=metadata, ) try: @@ -92,7 +94,6 @@ def unregister(stub, model_name): if __name__ == "__main__": - parent_parser = argparse.ArgumentParser(add_help=False) parent_parser.add_argument( "model_name", @@ -141,10 +142,12 @@ def unregister(stub, model_name): args = parser.parse_args() + metadata = (("protocol", "gRPC"), ("session_id", "12345")) + if args.action == "infer": - infer(get_inference_stub(), args.model_name, args.model_input) + infer(get_inference_stub(), args.model_name, args.model_input, metadata) elif args.action == "infer_stream": - infer_stream(get_inference_stub(), args.model_name, args.model_input) + infer_stream(get_inference_stub(), args.model_name, args.model_input, metadata) elif args.action == "register": register(get_management_stub(), args.model_name, args.mar_set) elif args.action == "unregister": From 03ad862e77267fbeccd9f04551ec521cecc29ba7 Mon Sep 17 00:00:00 2001 From: Ankith Gunapal Date: Thu, 24 Aug 2023 15:24:31 -0700 Subject: [PATCH 08/11] Profile TorchServe Handler (preprocess vs inference vs post-process) (#2470) * Profile TS Handler using ab tool * Added an example * Added an example * handler class not needed * Add model_yaml_config to MockCOntext * remove unnecessary config * based on review comments * Added details on how to enable this * Added details on how to enable this * lint fix * lint fix * lint fix --------- Co-authored-by: Geeta Chauhan <4461127+chauhang@users.noreply.github.com> --- benchmarks/benchmark-ab.py | 22 +++++++ benchmarks/utils/gen_model_config_json.py | 43 ++++++++---- examples/benchmarking/resnet50/README.md | 45 +++++++++++++ .../resnet50/benchmark_profile.yaml | 16 +++++ .../benchmarking/resnet50/model-config.yaml | 2 + examples/benchmarking/resnet50/model.py | 6 ++ 
examples/benchmarking/resnet50/resnet50.yaml | 24 +++++++ ts/handler_utils/timer.py | 66 +++++++++++++++++++ ts/torch_handler/base_handler.py | 8 ++- ts/torch_handler/image_classifier.py | 20 +++--- .../unit_tests/test_utils/mock_context.py | 7 ++ ts/torch_handler/vision_handler.py | 3 + 12 files changed, 239 insertions(+), 23 deletions(-) create mode 100644 examples/benchmarking/resnet50/README.md create mode 100644 examples/benchmarking/resnet50/benchmark_profile.yaml create mode 100644 examples/benchmarking/resnet50/model-config.yaml create mode 100644 examples/benchmarking/resnet50/model.py create mode 100644 examples/benchmarking/resnet50/resnet50.yaml create mode 100644 ts/handler_utils/timer.py diff --git a/benchmarks/benchmark-ab.py b/benchmarks/benchmark-ab.py index ebe48ea50a..a2a609c4e9 100644 --- a/benchmarks/benchmark-ab.py +++ b/benchmarks/benchmark-ab.py @@ -30,6 +30,7 @@ "image": "", "docker_runtime": "", "backend_profiling": False, + "handler_profiling": False, "generate_graphs": False, "config_properties": "config.properties", "inference_model_url": "predictions/benchmark", @@ -95,6 +96,12 @@ def json_provider(file_path, cmd_name): default=False, help="Enable backend profiling using CProfile. Default False", ) +@click.option( + "--handler_profiling", + "-hp", + default=False, + help="Enable handler profiling. Default False", +) @click.option( "--generate_graphs", "-gg", @@ -143,6 +150,7 @@ def benchmark( image, docker_runtime, backend_profiling, + handler_profiling, config_properties, inference_model_url, report_location, @@ -163,6 +171,7 @@ def benchmark( "image": image, "docker_runtime": docker_runtime, "backend_profiling": backend_profiling, + "handler_profiling": handler_profiling, "config_properties": config_properties, "inference_model_url": inference_model_url, "report_location": report_location, @@ -469,6 +478,17 @@ def generate_report(warm_up_lines): } +def update_metrics(): + if execution_params["handler_profiling"]: + opt_metrics = { + "handler_preprocess.txt": "ts_handler_preprocess", + "handler_inference.txt": "ts_handler_inference", + "handler_postprocess.txt": "ts_handler_postprocess", + } + metrics.update(opt_metrics) + return metrics + + def extract_metrics(warm_up_lines): with open(execution_params["metric_log"]) as f: lines = f.readlines() @@ -476,6 +496,8 @@ def extract_metrics(warm_up_lines): click.secho(f"Dropping {warm_up_lines} warmup lines from log", fg="green") lines = lines[warm_up_lines:] + metrics = update_metrics() + for k, v in metrics.items(): all_lines = [] pattern = re.compile(v) diff --git a/benchmarks/utils/gen_model_config_json.py b/benchmarks/utils/gen_model_config_json.py index 6b963e13f0..b9534934a1 100644 --- a/benchmarks/utils/gen_model_config_json.py +++ b/benchmarks/utils/gen_model_config_json.py @@ -2,11 +2,11 @@ import copy import json import os + import yaml def main(): - parser = argparse.ArgumentParser() parser.add_argument( @@ -22,6 +22,7 @@ def main(): arguments = parser.parse_args() convert_yaml_to_json(arguments.input, arguments.output) + MODEL_CONFIG_KEY = { "batch_size", "batch_delay", @@ -30,12 +31,18 @@ def main(): "concurrency", "workers", "input", - "processors" + "processors", + "handler_profiling", } + def convert_yaml_to_json(yaml_file_path, output_dir): - print("convert_yaml_to_json yaml_file_path={}, output_dir={}".format(yaml_file_path, output_dir)) - with open(yaml_file_path, 'r') as f: + print( + "convert_yaml_to_json yaml_file_path={}, output_dir={}".format( + yaml_file_path, output_dir + ) + ) + with 
open(yaml_file_path, "r") as f: yaml_dict = yaml.safe_load(f) for model, config in yaml_dict.items(): @@ -58,10 +65,9 @@ def convert_yaml_to_json(yaml_file_path, output_dir): batch_worker_list = [] for batch_size in batch_size_list: for workers in workers_list: - batch_worker_list.append({ - "batch_size" : batch_size, - "workers" : workers - }) + batch_worker_list.append( + {"batch_size": batch_size, "workers": workers} + ) benchmark_configs = [] for batch_worker in batch_worker_list: @@ -72,25 +78,34 @@ def convert_yaml_to_json(yaml_file_path, output_dir): for bConfig in benchmark_configs: for i in range(len(processors)): if type(processors[i]) is str: - path = '{}/{}'.format(output_dir, processors[i]) + path = "{}/{}".format(output_dir, processors[i]) if not os.path.isdir(path): continue - benchmark_config_file = '{}/{}_w{}_b{}.json'\ - .format(path, model_name, bConfig["workers"], bConfig["batch_size"]) + benchmark_config_file = "{}/{}_w{}_b{}.json".format( + path, + model_name, + bConfig["workers"], + bConfig["batch_size"], + ) with open(benchmark_config_file, "w") as outfile: json.dump(bConfig, outfile, indent=4) elif type(processors[i]) is dict: - path = '{}/gpu'.format(output_dir) + path = "{}/gpu".format(output_dir) if not os.path.isdir(path): continue bConfig["gpus"] = processors[i]["gpus"] - benchmark_config_file = '{}/{}_w{}_b{}.json'\ - .format(path, model_name, bConfig["workers"], bConfig["batch_size"]) + benchmark_config_file = "{}/{}_w{}_b{}.json".format( + path, + model_name, + bConfig["workers"], + bConfig["batch_size"], + ) with open(benchmark_config_file, "w") as outfile: json.dump(bConfig, outfile, indent=4) del bConfig["gpus"] + if __name__ == "__main__": main() diff --git a/examples/benchmarking/resnet50/README.md b/examples/benchmarking/resnet50/README.md new file mode 100644 index 0000000000..39721f6954 --- /dev/null +++ b/examples/benchmarking/resnet50/README.md @@ -0,0 +1,45 @@ + +# Benchmark ResNet50 and profile the detailed split of PredictionTime + +This example shows how to run the benchmark ab tool on ResNet50 and identify the time spent on preprocess, inference and postprocess + +Change directory to the root of `serve` +Ex: if `serve` is under `/home/ubuntu`, change directory to `/home/ubuntu/serve` + + +## Download the weights + +``` +wget https://download.pytorch.org/models/resnet50-11ad3fa6.pth +``` + +### Create model archive + +To enable profiling of TorchServe Handler, add the following config in model-config.yaml +``` +handler: + profile: true +``` + +``` +torch-model-archiver --model-name resnet-50 --version 1.0 --model-file ./examples/benchmarking/resnet50/model.py --serialized-file resnet50-11ad3fa6.pth --handler image_classifier --extra-files ./examples/image_classifier/index_to_name.json --config-file ./examples/benchmarking/resnet50/model-config.yaml + +mkdir model_store +mv resnet-50.mar model_store/. 
+``` + +### Install dependencies for benchmark tool + +``` +sudo apt-get update -y +sudo apt-get install -y apache2-utils +pip install -r benchmarks/requirements-ab.txt +``` + +### Run ab tool for benchmarking + +``` +python benchmarks/auto_benchmark.py --input examples/benchmarking/resnet50/benchmark_profile.yaml --skip true +``` + +This generates the report under `/tmp/ts_benchmarking/report.md` diff --git a/examples/benchmarking/resnet50/benchmark_profile.yaml b/examples/benchmarking/resnet50/benchmark_profile.yaml new file mode 100644 index 0000000000..eb2d57e204 --- /dev/null +++ b/examples/benchmarking/resnet50/benchmark_profile.yaml @@ -0,0 +1,16 @@ +# Torchserve version is to be installed. It can be one of the options +# - branch : "master" +# - nightly: "2022.3.16" +# - release: "0.5.3" +# Nightly build will be installed if "ts_version" is not specifiged +#ts_version: +# branch: &ts_version "master" + +# a list of model configure yaml files defined in benchmarks/models_config +# or a list of model configure yaml files with full path +models: + - "/home/ubuntu/serve/examples/benchmarking/resnet50/resnet50.yaml" + +# benchmark on "cpu" or "gpu". +# "cpu" is set if "hardware" is not specified +hardware: &hardware "gpu" diff --git a/examples/benchmarking/resnet50/model-config.yaml b/examples/benchmarking/resnet50/model-config.yaml new file mode 100644 index 0000000000..a8cbf248c4 --- /dev/null +++ b/examples/benchmarking/resnet50/model-config.yaml @@ -0,0 +1,2 @@ +handler: + profile: true diff --git a/examples/benchmarking/resnet50/model.py b/examples/benchmarking/resnet50/model.py new file mode 100644 index 0000000000..ac61782d3a --- /dev/null +++ b/examples/benchmarking/resnet50/model.py @@ -0,0 +1,6 @@ +from torchvision.models.resnet import Bottleneck, ResNet + + +class ImageClassifier(ResNet): + def __init__(self): + super(ImageClassifier, self).__init__(Bottleneck, [3, 4, 6, 3]) diff --git a/examples/benchmarking/resnet50/resnet50.yaml b/examples/benchmarking/resnet50/resnet50.yaml new file mode 100644 index 0000000000..2f97e0a8ca --- /dev/null +++ b/examples/benchmarking/resnet50/resnet50.yaml @@ -0,0 +1,24 @@ +--- +resnet50: + eager_mode: + benchmark_engine: "ab" + url: "file:///home/ubuntu/serve/model_store/resnet-50.mar" + workers: + - 4 + batch_delay: 100 + batch_size: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + requests: 10000 + concurrency: 100 + input: "./examples/image_classifier/kitten.jpg" + handler_profiling: true + exec_env: "local" + processors: + - "cpu" + - "gpus": "all" diff --git a/ts/handler_utils/timer.py b/ts/handler_utils/timer.py new file mode 100644 index 0000000000..a747eea3c2 --- /dev/null +++ b/ts/handler_utils/timer.py @@ -0,0 +1,66 @@ +""" +Decorator for timing handler methods + +Use this decorator to compute the execution time for your preprocesss, inference and +postprocess methods. +By default this feature is not enabled. 
+ +To enable this, add the following section in your model-config.yaml file + +handler: + profile: true + +An example of running benchmarks with the profiling enabled is in +https://github.com/pytorch/serve/tree/master/examples/benchmarking/resnet50 + +""" + +import time + +import torch + + +def timed(func): + def wrap_func(self, *args, **kwargs): + # Measure time if config specified in model_yaml_config + if ( + "handler" in self.context.model_yaml_config + and "profile" in self.context.model_yaml_config["handler"] + ): + if self.context.model_yaml_config["handler"]["profile"]: + # Measure start time + if torch.cuda.is_available(): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + else: + start = time.time() + + result = func(self, *args, **kwargs) + + # Measure end time + if torch.cuda.is_available(): + end.record() + torch.cuda.synchronize() + else: + end = time.time() + + # Measure time taken to execute the function in miliseconds + if torch.cuda.is_available(): + duration = start.elapsed_time(end) + else: + duration = (end - start) * 1000 + + # Add metrics for profiling + metrics = self.context.metrics + metrics.add_time("ts_handler_" + func.__name__, duration) + else: + # If profile config specified in model_yaml_config is False + result = func(self, *args, **kwargs) + else: + # If no profile config specified in model_yaml_config + result = func(self, *args, **kwargs) + + return result + + return wrap_func diff --git a/ts/torch_handler/base_handler.py b/ts/torch_handler/base_handler.py index 2e3e716a6b..227a4ec56c 100644 --- a/ts/torch_handler/base_handler.py +++ b/ts/torch_handler/base_handler.py @@ -12,6 +12,8 @@ import torch from pkg_resources import packaging +from ts.handler_utils.timer import timed + from ..utils.util import ( check_valid_pt2_backend, list_classes_from_module, @@ -77,7 +79,8 @@ ONNX_AVAILABLE = False try: - import torch_tensorrt + import torch_tensorrt # nopycln: import + logger.info("Torch TensorRT enabled") except ImportError: logger.warning("Torch TensorRT not enabled") @@ -265,6 +268,7 @@ def _load_pickled_model(self, model_dir, model_file, model_pt_path): model.load_state_dict(state_dict) return model + @timed def preprocess(self, data): """ Preprocess function to convert the request input to a tensor(Torchserve supported format). @@ -279,6 +283,7 @@ def preprocess(self, data): return torch.as_tensor(data, device=self.device) + @timed def inference(self, data, *args, **kwargs): """ The Inference Function is used to make a prediction call on the given input request. 
@@ -296,6 +301,7 @@ def inference(self, data, *args, **kwargs): results = self.model(marshalled_data, *args, **kwargs) return results + @timed def postprocess(self, data): """ The post process function makes use of the output from the inference and converts into a diff --git a/ts/torch_handler/image_classifier.py b/ts/torch_handler/image_classifier.py index f43eac5e6f..ef194d3924 100644 --- a/ts/torch_handler/image_classifier.py +++ b/ts/torch_handler/image_classifier.py @@ -5,8 +5,10 @@ import torch.nn.functional as F from torchvision import transforms +from ts.handler_utils.timer import timed + +from ..utils.util import map_class_to_label from .vision_handler import VisionHandler -from ..utils.util import map_class_to_label class ImageClassifier(VisionHandler): @@ -18,13 +20,14 @@ class ImageClassifier(VisionHandler): topk = 5 # These are the standard Imagenet dimensions # and statistics - image_processing = transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - ]) + image_processing = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) def set_max_result_classes(self, topk): self.topk = topk @@ -32,6 +35,7 @@ def set_max_result_classes(self, topk): def get_max_result_classes(self): return self.topk + @timed def postprocess(self, data): ps = F.softmax(data, dim=1) probs, classes = torch.topk(ps, self.topk, dim=1) diff --git a/ts/torch_handler/unit_tests/test_utils/mock_context.py b/ts/torch_handler/unit_tests/test_utils/mock_context.py index 4ee1aeb4ec..287074c6eb 100644 --- a/ts/torch_handler/unit_tests/test_utils/mock_context.py +++ b/ts/torch_handler/unit_tests/test_utils/mock_context.py @@ -21,6 +21,7 @@ def __init__( model_file="model.py", gpu_id="0", model_name="mnist", + model_yaml_config_file=None, ): self.manifest = {"model": {}} if model_pt_file: @@ -36,6 +37,12 @@ def __init__( self.explain = False self.metrics = MetricsStore(uuid.uuid4(), model_name) + self.model_yaml_config = {} + + if model_yaml_config_file: + self.model_yaml_config = get_yaml_config( + os.path.join(model_dir, model_yaml_config_file) + ) def get_request_header(self, idx, exp): if idx and exp: diff --git a/ts/torch_handler/vision_handler.py b/ts/torch_handler/vision_handler.py index 0ad08af327..9d7778b41d 100644 --- a/ts/torch_handler/vision_handler.py +++ b/ts/torch_handler/vision_handler.py @@ -11,6 +11,8 @@ from captum.attr import IntegratedGradients from PIL import Image +from ts.handler_utils.timer import timed + from .base_handler import BaseHandler @@ -27,6 +29,7 @@ def initialize(self, context): if not properties.get("limit_max_image_pixels"): Image.MAX_IMAGE_PIXELS = None + @timed def preprocess(self, data): """The preprocess function of MNIST program converts the input data to a float tensor From 1a61d2c1880ac2897b2843c7aef0a8286d67449c Mon Sep 17 00:00:00 2001 From: Ethan Kim <47581967+ethankim00@users.noreply.github.com> Date: Thu, 24 Aug 2023 18:45:51 -0400 Subject: [PATCH 09/11] fix typos (#2536) Co-authored-by: Ethan Kim Co-authored-by: Ankith Gunapal --- ts/context.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ts/context.py b/ts/context.py index 82a7e352d9..aa5d9babda 100644 --- a/ts/context.py +++ b/ts/context.py @@ -8,7 +8,7 @@ class Context(object): """ Context stores model relevant worker 
information - Some fixed during load times and some + Some fixed during load times and some set by the service """ def __init__( @@ -39,7 +39,7 @@ def __init__( self._limit_max_image_pixels = True self.metrics = metrics self.model_yaml_config = model_yaml_config - # add cient socket variable cl_socket to be used for send_intermediate_predict_response + # add client socket variable cl_socket to be used for send_intermediate_predict_response self.cl_socket = None @property From 2a386ec8f81ae8cc0c79be6aaf35b1c8118464c5 Mon Sep 17 00:00:00 2001 From: sekyondaMeta <127536312+sekyondaMeta@users.noreply.github.com> Date: Thu, 24 Aug 2023 19:08:03 -0400 Subject: [PATCH 10/11] Adding Performance Checklist to site (#2526) * Adding Performance Checklist to site Adding performance checklist to site. Plan to add this checklist to FAQs page in the future as well. For now, it can be accessed through the performance page. * Spelling update * Update wordlist.txt * Update performance_checklist.md * Update README.md --------- Co-authored-by: Mark Saroufim Co-authored-by: Ankith Gunapal --- docs/performance_checklist.md | 38 +++++++++++++++++++++++++ docs/performance_guide.md | 6 ++++ kubernetes/AKS/README.md | 10 +++---- ts_scripts/spellcheck_conf/wordlist.txt | 13 +++++++++ 4 files changed, 62 insertions(+), 5 deletions(-) create mode 100644 docs/performance_checklist.md diff --git a/docs/performance_checklist.md b/docs/performance_checklist.md new file mode 100644 index 0000000000..d984ed37e4 --- /dev/null +++ b/docs/performance_checklist.md @@ -0,0 +1,38 @@ +# Model Inference Optimization Checklist + +This checklist describes some steps that should be completed when diagnosing model inference performance issues. Some of these suggestions are only applicable to NLP models (e.g., ensuring the input is not over-padded and sequence bucketing), but the general principles are useful for other models too. + +## General System Optimizations + +- Check the versions of PyTorch, Nvidia driver, and other components and update to the latest compatible releases. Oftentimes known performance bugs have already been fixed. + +- Collect system-level activity logs to understand the overall resource utilizations. It’s useful to know how the model inference pipeline is using the system resources at a high level, as the first step of optimization. Even simple CLI tools such as nvidia-smi and htop would be helpful. + +- Start with a target with the highest impact on performance. It should be obvious from the system activity logs where the biggest bottleneck is – look beyond model inference, as pre/post processing can be expensive and can affect the end-to-end throughput just as much. + +- Quantify and mitigate the influence of slow I/O such as disk and network on end-to-end performance. While optimizing I/O is out of scope for this checklist, look for techniques that use async, concurrency, pipelining, etc. to effectively “hide” the cost of I/O. + +- For model inference on input sequences of dynamic length (e.g., transformers for NLP), make sure the tokenizer is not over-padding the input. If a transformer was trained with padding to a constant length (e.g., 512) and deployed with the same padding, it would run unnecessarily slow (orders of magnitude) on short sequences. + +- Vision models with input in JPEG format often benefit from faster JPEG decoding on CPU such as libjpeg-turbo and Pillow-SIMD, and on GPU such as torchvision.io.decode_jpeg and Nvidia DALI. 
+As this [example](https://colab.research.google.com/drive/1NMaLS8PG0eYhbd8IxQAajXgXNIZ_AvHo?usp=sharing) shows, Nvidia DALI is about 20% faster than torchvision, even on an old K80 GPU. + +## Model Inference Optimizations + +Start model inference optimization only after other factors, the “low-hanging fruit”, have been extensively evaluated and addressed. + +- Use fp16 for GPU inference. The speed will most likely more than double on newer GPUs with tensor cores, with negligible accuracy degradation. Technically fp16 is a type of quantization but since it seldom suffers from loss of accuracy for inference it should always be explored. As shown in this [article](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html#abstract), use of fp16 offers speed up in large neural network applications. + +- Use model quantization (i.e. int8) for CPU inference. Explore different quantization options: dynamic quantization, static quantization, and quantization aware training, as well as tools such as Intel Neural Compressor that provide more sophisticated quantization methods. It is worth noting that quantization comes with some loss in accuracy and might not always offer significant speed up on some hardware thus this might not always be the right approach. + +- Balance throughput and latency with smart batching. While meeting the latency SLA try larger batch sizes to increase the throughput. + +- Try optimized inference engines such as onnxruntime, tensorRT, lightseq, ctranslate-2, etc. These engines often provide additional optimizations such as operator fusion, in addition to model quantization. + +- Try model distillation. This is more involved and often requires training data, but the potential gain can be large. For example, MiniLM achieves 99% the accuracy of the original BERT base model while being 2X faster. + +- If working on CPU, you can try core pinning. You can find more information on how to work with this [in this blog post](https://pytorch.org/tutorials/intermediate/torchserve_with_ipex#grokking-pytorch-intel-cpu-performance-from-first-principles). + +- For batch processing on sequences with different lengths, sequence bucketing could potentially improve the throughput by 2X. In this case, a simple implementation of sequence bucketing is to sort all input by sequence length before feeding them to the model, as this reduces unnecessary padding when batching the sequences. + +While this checklist is not exhaustive, going through the items will likely help you squeeze more performance out of your model inference pipeline. diff --git a/docs/performance_guide.md b/docs/performance_guide.md index d72c31a4f2..6804f82d26 100644 --- a/docs/performance_guide.md +++ b/docs/performance_guide.md @@ -1,6 +1,8 @@ # [Performance Guide](#performance-guide) In case you're interested in optimizing the memory usage, latency or throughput of a PyTorch model served with TorchServe, this is the guide for you. +We have also created a quick checklist here for extra things to try outside of what is covered on this page. You can find the checklist [here](performance_checklist.md). + ## Optimizing PyTorch There are many tricks to optimize PyTorch models for production including but not limited to distillation, quantization, fusion, pruning, setting environment variables and we encourage you to benchmark and see what works best for you. @@ -92,3 +94,7 @@ Visit this [link]( https://github.com/pytorch/kineto/tree/main/tb_plugin) to lea
<h4>TorchServe on the Animated Drawings App</h4>
For some insight into fine tuning TorchServe performance in an application, take a look at this [article](https://pytorch.org/blog/torchserve-performance-tuning/). The case study shown here uses the Animated Drawings App from Meta to improve TorchServe performance.
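The checklist introduced by this patch recommends int8 quantization for CPU inference. As a rough illustration of the dynamic quantization option it mentions (this sketch is not part of the patch; the toy model, layer sizes, and input shape are placeholders), PyTorch's built-in API can quantize the `nn.Linear` layers of an eager-mode model in one call:

```python
import torch
import torch.nn as nn

# Placeholder model; any module containing nn.Linear layers works the same way.
model = nn.Sequential(nn.Linear(128, 256), nn.ReLU(), nn.Linear(256, 10)).eval()

# Dynamic quantization stores the Linear weights as int8 and quantizes
# activations on the fly at inference time; it targets CPU execution and
# needs no calibration data.
quantized = torch.quantization.quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)

with torch.inference_mode():
    output = quantized(torch.randn(1, 128))
print(output.shape)  # torch.Size([1, 10])
```

In a TorchServe handler, a conversion like this would typically happen once in `initialize()` so that every request is served by the already-quantized model.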
+<h4>Performance Checklist</h4>
+ +We have also created a quick checklist here for extra things to try outside of what is covered on this page. You can find the checklist [here](performance_checklist.md). diff --git a/kubernetes/AKS/README.md b/kubernetes/AKS/README.md index 4948e10f14..99b6074fe4 100644 --- a/kubernetes/AKS/README.md +++ b/kubernetes/AKS/README.md @@ -291,7 +291,7 @@ az group delete --name myResourceGroup --yes --no-wait ``` ## Troubleshooting - + **Troubleshooting Azure Cli login** @@ -299,11 +299,11 @@ az group delete --name myResourceGroup --yes --no-wait Otherwise, open a browser page at https://aka.ms/devicelogin and enter the authorization code displayed in your terminal. If no web browser is available or the web browser fails to open, use device code flow with az login --use-device-code. Or you can login with your credential in command line, more details, see https://docs.microsoft.com/en-us/cli/azure/authenticate-azure-cli. - + **Troubleshooting Azure resource for AKS cluster creation** - - * Check AKS available region, https://azure.microsoft.com/en-us/explore/global-infrastructure/products-by-region/?products=kubernetes-service + + * Check AKS available region, https://azure.microsoft.com/en-us/explore/global-infrastructure/products-by-region/ * Check AKS quota and VM size limitation, https://docs.microsoft.com/en-us/azure/aks/quotas-skus-regions * Check whether your subscription has enough quota to create AKS cluster, https://docs.microsoft.com/en-us/azure/networking/check-usage-against-limits - + **For more AKS troubleshooting, please visit https://docs.microsoft.com/en-us/azure/aks/troubleshooting** diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt index 9f6e6e8aab..9aa7a9223f 100644 --- a/ts_scripts/spellcheck_conf/wordlist.txt +++ b/ts_scripts/spellcheck_conf/wordlist.txt @@ -1071,6 +1071,19 @@ meshConfig baseimage cuDNN Xformer +MiniLM +SIMD +SLA +htop +jpeg +libjpeg +lightseq +multithreading +onnxruntime +pipelining +tensorRT +utilizations +ctranslate grpcurl ExamplePercentMetric HandlerMethodTime From bb4eb8b46977757d2e3659ccaa37aea38f39efc1 Mon Sep 17 00:00:00 2001 From: sekyondaMeta <127536312+sekyondaMeta@users.noreply.github.com> Date: Thu, 24 Aug 2023 19:50:05 -0400 Subject: [PATCH 11/11] Updates to Index and FAQs pages (#2524) Adding LMI page to Serve landing page. Adding CPU performance to FAQs Minor update to CPU performance section in performance doc Co-authored-by: Geeta Chauhan <4461127+chauhang@users.noreply.github.com> Co-authored-by: Ankith Gunapal --- docs/FAQs.md | 23 +++++++++++++++++++---- docs/index.rst | 7 +++++++ docs/performance_guide.md | 10 ++++++++-- 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/docs/FAQs.md b/docs/FAQs.md index 4c9be8a06d..348414d765 100644 --- a/docs/FAQs.md +++ b/docs/FAQs.md @@ -1,6 +1,7 @@ # FAQ'S Contents of this document. * [General](#general) +* [Performance](#performance) * [Deployment and config](#deployment-and-config) * [API](#api) * [Handler](#handler) @@ -34,9 +35,23 @@ No, As of now only python based models are supported. Torchserve is derived from Multi-Model-Server. However, Torchserve is specifically tuned for Pytorch models. It also has new features like Snapshot and model versioning. ### How to decode international language in inference response on client side? -By default, Torchserve uses utf-8 to encode if the inference response is string. So client can use utf-8 to decode. +By default, Torchserve uses utf-8 to encode if the inference response is string. 
So client can use utf-8 to decode. -If a model converts international language string to bytes, client needs to use the codec mechanism specified by the model such as in https://github.com/pytorch/serve/blob/master/examples/nmt_transformer/model_handler_generalized.py#L55 +If a model converts international language string to bytes, client needs to use the codec mechanism specified by the model such as in https://github.com/pytorch/serve/blob/master/examples/nmt_transformer/model_handler_generalized.py + +## Performance + +Relevant documents. +- [Performance Guide](performance_guide.md) + +### How do I improve TorchServe performance on CPU? +CPU performance is heavily influenced by launcher core pinning. We recommend setting the following properties in your `config.properties`: + +```bash +cpu_launcher_enable=true +cpu_launcher_args=--use_logical_core +``` +More background on improving CPU performance can be found in this [blog post](https://pytorch.org/tutorials/intermediate/torchserve_with_ipex#grokking-pytorch-intel-cpu-performance-from-first-principles). ## Deployment and config Relevant documents. @@ -97,7 +112,7 @@ TorchServe looks for the config.property file according to the order listed in t - [models](configuration.md): Defines a list of models' configuration in config.property. A model's configuration can be overridden by [management API](management_api.md). It does not decide which models will be loaded during TorchServe start. There is no relationship b.w "models" and "load_models" (ie. TorchServe command line option [--models](configuration.md)). -### +### ## API Relevant documents @@ -133,7 +148,7 @@ Refer to [default handlers](default_handlers.md) for more details. ### Is it possible to deploy Hugging Face models? Yes, you can deploy Hugging Face models using a custom handler. -Refer to [HuggingFace_Transformers](https://github.com/pytorch/serve/blob/master/examples/Huggingface_Transformers/README.md#huggingface-transformers) for example. +Refer to [HuggingFace_Transformers](https://github.com/pytorch/serve/blob/master/examples/Huggingface_Transformers/README.md#huggingface-transformers) for example. ## Model-archiver Relevant documents diff --git a/docs/index.rst b/docs/index.rst index 3b41e704c3..d8ee4ee63c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -56,6 +56,13 @@ What's going on in TorchServe? :link: performance_guide.html :tags: Performance,Troubleshooting +.. customcarditem:: + :header: Large Model Inference + :card_description: Serving Large Models with TorchServe + :image: https://github.com/raw/pytorch/serve/master/docs/images/ts-lmi-internal.png + :link: large_model_inference.html + :tags: Large-Models,Performance + .. customcarditem:: :header: Troubleshooting :card_description: Various updates on Torcherve and use cases. diff --git a/docs/performance_guide.md b/docs/performance_guide.md index 6804f82d26..2395a208ca 100644 --- a/docs/performance_guide.md +++ b/docs/performance_guide.md @@ -44,11 +44,17 @@ TorchServe exposes configurations that allow the user to configure the number of
<h4>TorchServe On CPU</h4>
-If working with TorchServe on a CPU here are some things to consider that could improve performance:
+If working with TorchServe on a CPU you can improve performance by setting the following in your `config.properties`:
+
+```bash
+cpu_launcher_enable=true
+cpu_launcher_args=--use_logical_core
+```
+These settings improve performance significantly through launcher core pinning.
+The theory behind this improvement is discussed in [this blog](https://pytorch.org/tutorials/intermediate/torchserve_with_ipex#grokking-pytorch-intel-cpu-performance-from-first-principles) which can be quickly summarized as:

* In a hyperthreading enabled system, avoid logical cores by setting thread affinity to physical cores only via core pinning.
* In a multi-socket system with NUMA, avoid cross-socket remote memory access by setting thread affinity to a specific socket via core pinning.

-These principles can be automatically configured via an easy to use launch script which has already been integrated into TorchServe. For more information take a look at this [case study](https://pytorch.org/tutorials/intermediate/torchserve_with_ipex#grokking-pytorch-intel-cpu-performance-from-first-principles) which dives into these points further with examples and explanations from first principles.
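To make the core pinning idea above more concrete, here is a minimal, illustrative sketch that is not part of the patch. It assumes Linux, the third-party `psutil` package, and the common core-id layout where ids 0 through N-1 map to distinct physical cores; in practice the `cpu_launcher_*` settings shown above take care of this automatically.

```python
import os

import psutil
import torch

# Hyperthreading typically exposes two logical cores per physical core; the
# launcher's core pinning keeps worker threads on physical cores only.
physical = psutil.cpu_count(logical=False) or psutil.cpu_count()
print(f"physical cores: {physical}, logical cores: {psutil.cpu_count(logical=True)}")

# Linux-only illustration: pin this process to the first `physical` CPU ids
# (distinct physical cores under the common enumeration) and size the
# intra-op thread pool to match.
os.sched_setaffinity(0, set(range(physical)))
torch.set_num_threads(physical)
```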
<h4>TorchServe on GPU</h4>
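The checklist added earlier in this patch series recommends fp16 for GPU inference. A minimal sketch of that idea, assuming a CUDA device and torchvision for a placeholder model (this snippet is illustrative and not part of the patch), could look like the following:

```python
import torch
import torchvision.models as models

# Placeholder model and input; any eager-mode module served on GPU works similarly.
model = models.resnet18(weights=None).eval().to("cuda")
batch = torch.randn(8, 3, 224, 224, device="cuda")

# Autocast runs the forward pass in fp16 on CUDA; on GPUs with tensor cores
# this often roughly doubles throughput with negligible accuracy loss.
with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.float16):
    logits = model(batch)

print(logits.dtype, logits.shape)  # torch.float16 torch.Size([8, 1000])
```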