Mlserver example #1110

Open · wants to merge 2 commits into main
75 changes: 75 additions & 0 deletions examples/mlserver/README.md
@@ -0,0 +1,75 @@
# **Step 1: Installation**

Contributor: best to add an intro paragraph to give users a heads up of what this example does.

Install DeepSparse and MLServer.

```bash
pip install -r requirements.txt
```
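
For reference, the `requirements.txt` used by this example (shown in the diff at the bottom of this PR) lists just two packages:

```
mlserver
deepsparse[transformers]
```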

# **Step 2: Write Custom Runtime**

We need to write a [Custom Inference Runtime](https://mlserver.readthedocs.io/en/stable/user-guide/custom.html) to use DeepSparse within MLServer.

### Implement `load()` and `predict()`

First, we implement the `load()` and `predict()` methods in `models/text-classification-model/models.py`. Note that your implementation of `load()` and `predict()` will vary with the task you choose.

Here's an example for text classification:
```python
from mlserver import MLModel
from mlserver.codecs import decode_args
from typing import List
from deepsparse import Pipeline

class DeepSparseRuntime(MLModel):
async def load(self) -> bool:
# compiles the pipeline
self._pipeline = Pipeline.create(
task = self._settings.parameters.task, # from model-settings.json
model_path = self._settings.parameters.model_path, # from model-settings.json
batch_size = self._settings.parameters.batch_size, # from model-settings.json
sequence_length = self._settings.parameters.sequence_length, # from model-settings.json
)
return True

@decode_args
async def predict(self, sequences: List[str]) -> List[str]:
# runs the inference
prediction = self._pipeline(sequences=sequences)
return prediction.labels
```

### Create `model-settings.json`

Second, we create a config at `models/text-classification-model/model-settings.json`. This file specifies where the custom runtime implementation lives, as well as the parameters of the DeepSparse inference session.

```json
{
"name": "text-classification-model",
"implementation": "models.DeepSparseRuntime",
"parameters": {
"task": "text-classification",
"model_path": "zoo:nlp/sentiment_analysis/obert-base/pytorch/huggingface/sst2/pruned90_quant-none",
"batch_size": 1,
"sequence_length": 128
}
}
```

# **Step 3: Launch MLServer**

Launch the server with the CLI:

```bash
mlserver start ./models/text-classification-model/
```

# **Step 4: Send Inference Requests**

The server now exposes an inference endpoint at `http://localhost:8080/v2/models/text-classification-model/infer`. `client.py` is a sample script that sends requests to this endpoint.

Run the following:
```bash
python3 client.py
```
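
For reference, the request body follows the V2 inference protocol. Below is a minimal sketch of a single request; the endpoint URL and payload shape mirror what `client.py` (shown later in this PR) sends for each sentence.

```python
import requests

# A single request against the endpoint above; the payload shape mirrors
# what client.py builds for each sentence.
URL = "http://localhost:8080/v2/models/text-classification-model/infer"
inference_request = {
    "inputs": [
        {
            "name": "sequences",
            "shape": [1],
            "datatype": "BYTES",
            "data": ["I love using DeepSparse on CPUs"],
        },
    ]
}
response = requests.post(URL, json=inference_request).json()
print(response["outputs"][0]["data"])
```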
27 changes: 27 additions & 0 deletions examples/mlserver/client.py
@@ -0,0 +1,27 @@
import requests, threading
Member: would suggest a few in line comments for self-documentation


NUM_THREADS = 2
URL = "http://localhost:8080/v2/models/text-classification-model/infer"
sentences = ["I hate using GPUs for inference", "I love using DeepSparse on CPUs"] * 100
Member: Should * 100 be * NUM_THREADS if we are taking only sentences[:NUM_THREADS] elements?

Member: @rsnm2 see suggestion below


def tfunc(text):
Member: would rename to something more descriptive like inference_request

inference_request = {
"inputs": [
{
"name": "sequences",
"shape": [1],
"datatype": "BYTES",
"data": [text],
},
]
}
resp = requests.post(URL, json=inference_request).json()
for output in resp["outputs"]:
print(output["data"])
Comment on lines +19 to +20
Member: executing a list printout while multithreaded may cause a race condition, any reason to not return the value and print in sequence at the end? (i.e., if thread 1 and thread 2 happen to execute at exactly the same time, they will print their lines at the same time and it might not be clear which is which)
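
One way to address this (a sketch, not part of the PR): have each worker store its labels and let the main thread print everything after the joins. The `predict` name and the `results` list below are illustrative only.

```python
import threading

import requests

NUM_THREADS = 2
URL = "http://localhost:8080/v2/models/text-classification-model/infer"
sentences = ["I hate using GPUs for inference", "I love using DeepSparse on CPUs"] * 100

# Collect results per thread and print after all joins, so output
# ordering does not depend on thread scheduling.
results = [None] * NUM_THREADS

def predict(idx, text):
    inference_request = {
        "inputs": [
            {"name": "sequences", "shape": [1], "datatype": "BYTES", "data": [text]},
        ]
    }
    resp = requests.post(URL, json=inference_request).json()
    results[idx] = [output["data"] for output in resp["outputs"]]

threads = [
    threading.Thread(target=predict, args=(i, sentence))
    for i, sentence in enumerate(sentences[:NUM_THREADS])
]
for thread in threads:
    thread.start()
for thread in threads:
    thread.join()
for labels in results:
    print(labels)
```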



threads = [threading.Thread(target=tfunc, args=(sentence,)) for sentence in sentences[:NUM_THREADS]]
for thread in threads:
thread.start()
for thread in threads:
thread.join()
Comment on lines +23 to +27
Member: it looks like this creates NUM_THREADS threads to make the request, is that intended? Might make more sense to create len(sentences) threads and execute NUM_THREADS at a time.

You can do this out of the box with ThreadPoolExecutor with something like:

Suggested change:
```python
from concurrent.futures.thread import ThreadPoolExecutor
threadpool = ThreadPoolExecutor(max_workers=NUM_THREADS)
results = threadpool.map(tfunc, sentences)
```
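
A small follow-up on this suggestion (not part of the PR, and assuming `tfunc` is changed to return its labels instead of printing them): `Executor.map` returns a lazy iterator that yields results in submission order, so the output can be drained and printed once at the end, which also avoids the interleaved-print concern raised above. `NUM_THREADS`, `sentences`, and `tfunc` are reused from the script as-is.

```python
from concurrent.futures.thread import ThreadPoolExecutor

# Assumes tfunc returns the labels for one sentence rather than printing them.
with ThreadPoolExecutor(max_workers=NUM_THREADS) as threadpool:
    for labels in threadpool.map(tfunc, sentences):  # yields in submission order
        print(labels)
```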

10 changes: 10 additions & 0 deletions examples/mlserver/models/text-classification-model/model-settings.json
@@ -0,0 +1,10 @@
{
"name": "text-classification-model",
"implementation": "models.DeepSparseRuntime",
"parameters": {
"task": "text-classification",
"model_path": "zoo:nlp/sentiment_analysis/obert-base/pytorch/huggingface/sst2/pruned90_quant-none",
"batch_size": 1,
"sequence_length": 128
}
}
19 changes: 19 additions & 0 deletions examples/mlserver/models/text-classification-model/models.py
@@ -0,0 +1,19 @@
from mlserver import MLModel
Member: this is great, love that it works out of the box - let's throw in the serving command as a comment just for convenience
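
For example (illustrative placement only), the top of `models.py` could carry the launch command already given in the README:

```python
# Serve this runtime with:
#   mlserver start ./models/text-classification-model/
```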

from mlserver.codecs import decode_args
from typing import List
from deepsparse import Pipeline

class DeepSparseRuntime(MLModel):
async def load(self) -> bool:
self._pipeline = Pipeline.create(
task = self._settings.parameters.task,
model_path = self._settings.parameters.model_path,
batch_size = self._settings.parameters.batch_size,
sequence_length = self._settings.parameters.sequence_length,
Member: is there a place for generic kwargs in the settings? Would be cool if we could use that instead to dump extra pipeline args so we can get full generic pipeline support out of the box
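
One possible shape for this (a sketch, not part of the PR, and assuming the installed MLServer version exposes a free-form `extra` dict on `parameters`; check the `ModelParameters` schema for the version in use): move the DeepSparse arguments under `"parameters": {"extra": {...}}` in `model-settings.json` and splat them straight into `Pipeline.create()`.

```python
from typing import List

from deepsparse import Pipeline
from mlserver import MLModel
from mlserver.codecs import decode_args


class DeepSparseRuntime(MLModel):
    async def load(self) -> bool:
        # Sketch only: assumes model-settings.json nests the DeepSparse arguments
        # under "parameters": {"extra": {...}} and that MLServer surfaces that
        # free-form dict as self._settings.parameters.extra.
        pipeline_kwargs = self._settings.parameters.extra or {}
        self._pipeline = Pipeline.create(**pipeline_kwargs)
        return True

    @decode_args
    async def predict(self, sequences: List[str]) -> List[str]:
        prediction = self._pipeline(sequences=sequences)
        return prediction.labels
```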

)
return True

@decode_args
async def predict(self, sequences: List[str]) -> List[str]:
prediction = self._pipeline(sequences=sequences)
return prediction.labels
2 changes: 2 additions & 0 deletions examples/mlserver/requirements.txt
@@ -0,0 +1,2 @@
mlserver
deepsparse[transformers]