[KV Cache Interface] Text Generation & Decoder Engine Implementation #1089

Merged: 101 commits, Jun 28, 2023
Changes from 80 commits
Commits (101)
48ac0ac
initial commit
dbogunowicz Jun 5, 2023
cf7f2b9
Update src/deepsparse/license.py
dbogunowicz Jun 5, 2023
832630a
Merge branch 'main' into feature/damian/do_not_save_to_tmp
dbogunowicz Jun 6, 2023
9958c83
Merge branch 'main' into feature/damian/do_not_save_to_tmp
dbogunowicz Jun 7, 2023
e6d2b03
limit to 150mb
dbogunowicz Jun 7, 2023
7f9935b
ready to review
dbogunowicz Jun 7, 2023
b1cf01b
initial commit
dbogunowicz Mar 2, 2023
0a3f48d
[Codegen][ORT][Static Seq Length] TextGenerationPipeline (#946)
dbogunowicz Mar 16, 2023
add4625
[CodeGen][Documentation] (#956)
dbogunowicz Mar 23, 2023
22d2746
reimplementation for generative pipelines
markurtz May 8, 2023
7f1651d
restore text generation from examples
dbogunowicz May 8, 2023
b85746d
[CodeGen] ONNX model loading to support >2Gb models / two engines (#991)
dbogunowicz May 8, 2023
aadc608
refactor sucessfull
dbogunowicz May 10, 2023
58bc2b0
Pipeline fully refactored, time to test engine support. Note: Sliding…
dbogunowicz May 11, 2023
d538444
First iteration with Sage
dbogunowicz May 11, 2023
e19676b
Apply suggestions from code review
dbogunowicz May 11, 2023
7908b74
ORT agrees with the Engine. But they both give not entirely correct r…
dbogunowicz May 11, 2023
4bc3472
dynamic ORT vs static DS
dbogunowicz May 12, 2023
c07f7ed
pipeline handles OPT multitoken pass
dbogunowicz May 16, 2023
fb77838
fixes to get static pipeline a little further along
bfineran May 16, 2023
2097463
adjust shapes and slicing to enable static autoregressive pass - ISSU…
bfineran May 17, 2023
5eb10a9
migrate from cache_length to positions input
bfineran May 18, 2023
9213f29
got if working for multitoken + single token scenario
dbogunowicz May 18, 2023
d9af004
cleanup the pipeline
dbogunowicz May 19, 2023
476f25d
further cleanup post merge
dbogunowicz May 19, 2023
fab44e4
Pipeline working for single-token inference only
dbogunowicz May 19, 2023
d454e2f
do not load the onnx model with external files twice
dbogunowicz May 19, 2023
1613e25
pipeline never redundantly saves the external data + more robust toke…
dbogunowicz May 19, 2023
b61055c
Stop saving tmp files, otherwise the engine looks for external files …
dbogunowicz May 19, 2023
6ee25fc
Left pad support
bfineran May 19, 2023
5d3004b
cleanup
dbogunowicz May 22, 2023
ace6fa5
cleanup2
dbogunowicz May 22, 2023
388586d
Add in pipeline timing
markurtz May 24, 2023
afd0139
add in force tokens logic
markurtz May 24, 2023
30eeda7
remove input validation for text generation pipelines
markurtz May 24, 2023
5882b56
remove multitoken support for now
markurtz May 24, 2023
4bbe33d
remove kv cache engine and other fixes
markurtz May 25, 2023
afa5746
nest input shape override
markurtz May 25, 2023
e2bb78c
comment out input shape override
markurtz May 25, 2023
2299009
add non batch override for ORT
markurtz May 25, 2023
2935b77
clean up generation pipeline
markurtz Jun 9, 2023
b89b156
Merge branch 'main' into feature/damian/do_not_save_to_tmp
dbogunowicz Jun 11, 2023
dc3d61b
initial commit
dbogunowicz Jun 5, 2023
a294265
Update src/deepsparse/license.py
dbogunowicz Jun 5, 2023
af97f2b
limit to 150mb
dbogunowicz Jun 7, 2023
c117788
ready to review
dbogunowicz Jun 7, 2023
4ad5f49
fix the erronous Makefile
dbogunowicz Jun 13, 2023
9e816bb
Merge branch 'feature/damian/do_not_save_to_tmp' of https://github.co…
dbogunowicz Jun 13, 2023
f97467f
perhaps fixed GHA
dbogunowicz Jun 13, 2023
6be8d87
take into consideration that GHA creates four files
dbogunowicz Jun 13, 2023
e2f088d
initial commit
dbogunowicz Jun 13, 2023
9fc6c64
Merge remote-tracking branch 'origin/feature/damian/do_not_save_to_tm…
dbogunowicz Jun 13, 2023
a610faf
tested with actual model
dbogunowicz Jun 13, 2023
347d1fb
remove val_inp argument
dbogunowicz Jun 13, 2023
e11027c
Update README.md
dbogunowicz Jun 13, 2023
a950910
Apply suggestions from code review
dbogunowicz Jun 13, 2023
c1d02dc
Update README.md
dbogunowicz Jun 13, 2023
711cdfb
Merge branch 'main' into feature/damian/codegen_pipeline_clean
dbogunowicz Jun 13, 2023
e602662
Merge branch 'main' into feature/damian/codegen_pipeline_clean
dbogunowicz Jun 14, 2023
06b5246
Merge branch 'main' into feature/damian/codegen_pipeline_clean
dbogunowicz Jun 16, 2023
5d59d23
initial implementation
dbogunowicz Jun 21, 2023
765a5f7
initial implementation
dbogunowicz Jun 21, 2023
15586a4
Revert "initial implementation"
dbogunowicz Jun 21, 2023
4d35779
rebase
dbogunowicz Jun 21, 2023
775c648
add tests
dbogunowicz Jun 21, 2023
54aec69
Merge branch 'feature/damian/codegen_pipeline_clean' of https://githu…
dbogunowicz Jun 21, 2023
25cdd38
strip down complexity out of text generation pipeline
dbogunowicz Jun 21, 2023
830a85e
Merge branch 'feature/damian/fb_kv_cache' into feature/damian/kv_cach…
dbogunowicz Jun 22, 2023
388e7ab
Merge branch 'feature/damian/kv_cache_ort' into feature/damian/decode…
dbogunowicz Jun 22, 2023
3970a7a
initial implementation
dbogunowicz Jun 22, 2023
7cdf939
Merge branch 'feature/damian/decoder_kv_cache' into feature/damian/de…
dbogunowicz Jun 22, 2023
950c653
In a good state for the review on 22.06
dbogunowicz Jun 22, 2023
ea82e99
remove files to make review easier
dbogunowicz Jun 22, 2023
016cac1
Revert "remove files to make review easier"
dbogunowicz Jun 22, 2023
c6efccd
Merge DecoderKVCache with KVCacheORT (KVCacheORT will not exist, it i…
dbogunowicz Jun 22, 2023
a19cf2e
Delete decoder_kv_cache.py
dbogunowicz Jun 22, 2023
c59da37
Delete test_decoder_kv_cache.py
dbogunowicz Jun 22, 2023
6d40c03
DecoderKVCache that manipulates cache state and additionally passes i…
dbogunowicz Jun 22, 2023
741f452
merge the functionalities of the engine and the decoder
dbogunowicz Jun 23, 2023
7b27abe
fix formatting of the transformers/utils/__init__.py
dbogunowicz Jun 23, 2023
db6b54b
improvements after the sync with Mark
dbogunowicz Jun 26, 2023
b3fb3b8
Merge branch 'feature/damian/fb_kv_cache' into feature/damian/kv_cach…
dbogunowicz Jun 26, 2023
47c0c4b
Merge remote-tracking branch 'origin/feature/damian/kv_cache_ort' int…
dbogunowicz Jun 26, 2023
76e332d
All changes applied, time for testing
dbogunowicz Jun 26, 2023
4791ed3
Merge remote-tracking branch 'origin/feature/damian/kv_cache_ort' int…
dbogunowicz Jun 26, 2023
8c5734b
Scaffolding to also run multitoken
dbogunowicz Jun 26, 2023
6c5daab
add delay_overwriting_inputs
dbogunowicz Jun 26, 2023
812408c
multitoken is working (although in limited capacity)
dbogunowicz Jun 27, 2023
952abda
fix no kv cache inference
dbogunowicz Jun 27, 2023
2ff4987
Do not create engine if not needed
dbogunowicz Jun 27, 2023
725a210
remove the prefill option
dbogunowicz Jun 27, 2023
108596e
fix docstring
dbogunowicz Jun 27, 2023
f6a9baf
remove prefill
dbogunowicz Jun 27, 2023
b25886a
fix the computation of total cache capacity
dbogunowicz Jun 27, 2023
53d7b70
Merge remote-tracking branch 'origin/feature/damian/kv_cache_ort' int…
dbogunowicz Jun 27, 2023
0b0f74a
merge
dbogunowicz Jun 27, 2023
4d6860a
Merge branch 'feature/damian/fb_kv_cache' into feature/damian/kv_cach…
dbogunowicz Jun 27, 2023
d68f045
Merge branch 'feature/damian/kv_cache_ort' into feature/damian/decode…
dbogunowicz Jun 27, 2023
759dc93
addressed PR comments
dbogunowicz Jun 28, 2023
3e1d32f
merge
dbogunowicz Jun 28, 2023
4c39d7f
quality
dbogunowicz Jun 28, 2023
49 changes: 2 additions & 47 deletions src/deepsparse/engine.py
@@ -54,7 +54,6 @@
"Scheduler",
"Context",
"MultiModelEngine",
"KVCacheEngine",
"BaseEngine",
]

@@ -292,6 +291,7 @@ def __init__(
self._num_streams,
self._scheduler.value,
None,
self._kv_cache_input_idxs,
)
else:
self._eng_net = LIB.deepsparse_engine(
Expand All @@ -301,6 +301,7 @@ def __init__(
self._num_streams,
self._scheduler.value,
None,
self._kv_cache_input_idxs,
)

def __call__(
@@ -845,52 +846,6 @@ def __init__(
)


class KVCacheEngine(Engine):
"""
Engine that can do kv caching.
"""

def __init__(
self,
model: Union[str, "Model", "File"],
batch_size: int = 1,
num_cores: int = None,
num_streams: int = None,
scheduler: Scheduler = None,
input_shapes: List[List[int]] = None,
kv_cache_bools: List[bool] = None,
prev_cache_length: int = 0,
):
BaseEngine.construct(
self, model, batch_size, num_cores, num_streams, scheduler, input_shapes
)

if kv_cache_bools is None:
# If no list was provided, then we assume all outputs except for the first are KV caches
# Note: In the future we can look at the names of outputs to be more sure
#
# Create a boolean list of every output of the model
output_names = get_output_names(self._model_path)
kv_cache_bools = [True for i in range(len(output_names))]
# Assume the first output is logits, and logits ought not to be cached
kv_cache_bools[0] = False

num_streams = _validate_num_streams(num_streams, self._num_cores)
if self._input_shapes:
raise NotImplementedError("Don't do this yet :)")
else:
self._eng_net = LIB.deepsparse_engine(
self._model_path,
self._batch_size,
self._num_cores,
num_streams,
self._scheduler.value,
None,
kv_cache_bools,
prev_cache_length,
)


def compile_model(
model: Union[str, "Model", "File"],
batch_size: int = 1,
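The `_kv_cache_input_idxs` argument threaded into `LIB.deepsparse_engine` above replaces the deleted `KVCacheEngine` and its `kv_cache_bools`/`prev_cache_length` pair. As a minimal sketch of where such indices could come from, assuming the cache inputs of the ONNX graph are identifiable by a `past_key_values` name prefix (the helper name and prefix are illustrative, not part of this diff):

```python
# Illustrative sketch only: derive KV cache input indices from an ONNX graph.
# Assumes cache inputs share a "past_key_values" name prefix; the engine's
# actual convention may differ.
import onnx


def kv_cache_input_idxs(onnx_path: str, prefix: str = "past_key_values"):
    # Skip external data; only the graph's input names are needed here
    model = onnx.load(onnx_path, load_external_data=False)
    return [
        idx
        for idx, graph_input in enumerate(model.graph.input)
        if graph_input.name.startswith(prefix)
    ]
```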
54 changes: 33 additions & 21 deletions src/deepsparse/pipeline.py
@@ -263,7 +263,7 @@ def __call__(self, *args, **kwargs) -> BaseModel:
batches = self.split_engine_inputs(engine_inputs, self._batch_size)

# submit split batches to engine threadpool
batch_outputs = list(self.executor.map(self.engine_forward, batches))
batch_outputs = [self.engine_forward(x) for x in batches]

# join together the batches of size `self._batch_size`
engine_outputs = self.join_engine_outputs(batch_outputs)
@@ -567,6 +567,34 @@ def _register_pipeline_tasks_decorator(pipeline_class: Pipeline):

return _register_pipeline_tasks_decorator

@staticmethod
def create_engine(
onnx_file_path: str,
engine_type: str,
engine_args: Dict,
context: Optional[Context] = None,
) -> Union[Engine, MultiModelEngine, ORTEngine]:
engine_type = engine_type.lower()

if engine_type == DEEPSPARSE_ENGINE:
if context is not None and isinstance(context, Context):
engine_args.pop("num_cores", None)
engine_args.pop("scheduler", None)
engine_args["context"] = context
return MultiModelEngine(
model=onnx_file_path,
**engine_args,
)
return Engine(onnx_file_path, **engine_args)

if engine_type == ORT_ENGINE:
return ORTEngine(onnx_file_path, **engine_args)

raise ValueError(
f"Unknown engine_type {engine_type}. Supported values include: "
f"{SUPPORTED_PIPELINE_ENGINES}"
)

@classmethod
def from_config(
cls,
@@ -791,26 +819,10 @@ def engine_forward(self, engine_inputs: List[numpy.ndarray]) -> List[numpy.ndarray]:
"""
return self.engine(engine_inputs)

def _initialize_engine(self) -> Union[Engine, ORTEngine]:
engine_type = self.engine_type.lower()

if engine_type == DEEPSPARSE_ENGINE:
if self.context is not None and isinstance(self.context, Context):
self._engine_args.pop("num_cores", None)
self._engine_args.pop("scheduler", None)
self._engine_args["context"] = self.context
return MultiModelEngine(
model=self.onnx_file_path,
**self._engine_args,
)
return Engine(self.onnx_file_path, **self._engine_args)
elif engine_type == ORT_ENGINE:
return ORTEngine(self.onnx_file_path, **self._engine_args)
else:
raise ValueError(
f"Unknown engine_type {self.engine_type}. Supported values include: "
f"{SUPPORTED_PIPELINE_ENGINES}"
)
def _initialize_engine(self) -> Union[Engine, MultiModelEngine, ORTEngine]:
return Pipeline.create_engine(
self.onnx_file_path, self.engine_type, self._engine_args, self.context
)

def _identifier(self):
# get pipeline identifier; used in the context of logging
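The refactor above centralizes engine construction: `_initialize_engine` now delegates to the new `Pipeline.create_engine` static method. A hypothetical call, where the model path is a placeholder and the engine-type strings are assumed to mirror the `DEEPSPARSE_ENGINE`/`ORT_ENGINE` constants referenced in the diff:

```python
from deepsparse import Pipeline

# Hypothetical usage; "/path/to/model.onnx" is a placeholder path.
# Passing a Context instance would route construction to MultiModelEngine.
engine = Pipeline.create_engine(
    onnx_file_path="/path/to/model.onnx",
    engine_type="deepsparse",  # assumed value of DEEPSPARSE_ENGINE
    engine_args={"batch_size": 1},
    context=None,
)
```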
23 changes: 23 additions & 0 deletions src/deepsparse/tasks.py
@@ -95,6 +95,12 @@ class SupportedTasks:
),
)

text_generation = namedtuple("text_generation", ["opt", "codegen", "bloom"])(
codegen=AliasedTask("codegen", []),
opt=AliasedTask("opt", []),
bloom=AliasedTask("bloom", []),
)
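The three task names registered above become creatable through the usual `Pipeline.create` entry point; a hypothetical example (the deployment path is a placeholder):

```python
from deepsparse import Pipeline

# "opt", "codegen", and "bloom" are the task names registered above
pipe = Pipeline.create(task="opt", model_path="/path/to/deployment")
```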

image_classification = namedtuple("image_classification", ["image_classification"])(
image_classification=AliasedTask(
"image_classification",
@@ -150,6 +156,9 @@ def check_register_task(
# custom task, register the CustomPipeline
import deepsparse.pipelines.custom_pipeline # noqa: F401

elif cls.is_text_generation(task):
import deepsparse.transformers.pipelines.text_generation # noqa: F401

elif cls.is_nlp(task):
# trigger transformers pipelines to register with Pipeline.register
import deepsparse.transformers.pipelines # noqa: F401
@@ -193,6 +202,20 @@ def check_register_task(
f"{list(all_tasks)}"
)

@classmethod
def is_text_generation(cls, task: str) -> bool:
"""
:param task: the name of the task to check whether it is a text generation task
such as codegen
:return: True if it is a text generation task, False otherwise
"""
return any(
[
text_generation_task.matches(task)
for text_generation_task in cls.text_generation
]
)
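A brief usage sketch of the new helper, mirroring the registration path added to `check_register_task` above:

```python
from deepsparse.tasks import SupportedTasks

if SupportedTasks.is_text_generation("opt"):
    # same import that check_register_task triggers for these tasks
    import deepsparse.transformers.pipelines.text_generation  # noqa: F401
```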

@classmethod
def is_nlp(cls, task: str) -> bool:
"""
50 changes: 47 additions & 3 deletions src/deepsparse/transformers/README.md
@@ -10,6 +10,7 @@ methods such as [pruning](https://neuralmagic.com/blog/pruning-overview/) and [q
These techniques result in significantly more performant and smaller models with limited to no effect on the baseline metrics.

This integration currently supports several fundamental NLP tasks:
- **Text Generation** - given the input prompt, generate an output text sequence (e.g. to fill in incomplete text or paraphrase part of the prompt)
- **Question Answering** - posing questions about a document
- **Sentiment Analysis** - assigning a sentiment to a piece of text
- **Text Classification** - assigning a label or class to a piece of text (e.g. duplicate question pairing)
@@ -30,10 +31,12 @@ compatible with our [hardware requirements](https://docs.neuralmagic.com/deepspa
By default, deploying a transformer with the DeepSparse Engine requires supplying the model in the ONNX format along with the HuggingFace supporting files.
This grants the engine the flexibility to serve any model in a framework-agnostic environment.

The DeepSparse pipelines require the following files within a folder on the local server to properly load a Transformers model:
In general, the DeepSparse pipelines require the following files within a folder on the local server to properly load a Transformers model (an example layout is shown after this list):
- `model.onnx`: The exported Transformers model in the [ONNX format](https://github.com/onnx/onnx).
- `tokenizer.json`: The [HuggingFace compatible tokenizer configuration](https://huggingface.co/docs/transformers/fast_tokenizers) used with the model.
- `model_kvcache.onnx` (optional): the ONNX model with KV Cache support (akin to the Transformers model exported with `use_cache = True`). Specific to the `text-generation` integration.
- `config.json`: The [HuggingFace compatible configuration file](https://huggingface.co/docs/transformers/main_classes/configuration) used with the model.
- `tokenizer_config.json`: The [HuggingFace compatible tokenizer configuration](https://huggingface.co/docs/transformers/fast_tokenizers) used with the model.
- `tokenizer.json`, `special_tokens_map.json`, `vocab.json`, `merges.txt` (optional): other files that may be required by the tokenizer.
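A hypothetical deployment folder containing the files above might look like this (`model_kvcache.onnx` applies only to `text-generation`):

```
trained_model/
├── model.onnx
├── model_kvcache.onnx      # optional, text-generation only
├── config.json
├── tokenizer_config.json
├── tokenizer.json
├── special_tokens_map.json
├── vocab.json
└── merges.txt
```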

Below we describe two ways to obtain the required structure.

Expand All @@ -48,7 +51,7 @@ sparseml.transformers.export_onnx --task question-answering --model_path model_p
```

This creates a `model.onnx` file in the directory of your `model_path` (e.g. `/trained_model/model.onnx`).
The `tokenizer.json` and `config.json` are stored under the `model_path` folder as well, so a DeepSparse pipeline ca be directly instantiated by using that folder after export (e.g. `/trained_model/`).
Any additional required files, such as `tokenizer.json` or `config.json`, are stored under the `model_path` folder as well, so a DeepSparse pipeline can be directly instantiated by using that folder after export (e.g. `/trained_model/`).

#### SparseZoo Stub
Alternatively, you can skip the process of the ONNX model export by using Neural Magic's [SparseZoo](https://sparsezoo.neuralmagic.com/). The SparseZoo contains pre-sparsified models and SparseZoo stubs enable you to reference any model on the SparseZoo in a convenient and predictable way.
@@ -137,6 +140,47 @@ response.text

>> '{"score":0.9534820914268494,"start":8,"end":14,"answer":"batman"}'
```
### Text Generation
The text generation task generates a sequence of tokens given a prompt. Popular text generation LLMs (Large Language Models) are used
for chat (the instruction-tuned models), code generation, text summarization, and filling in missing text. The following example uses a sparsified text generation
OPT model to complete the prompt.

[List of available SparseZoo Text Generation Models](https://sparsezoo.neuralmagic.com/?useCase=text_generation)

#### Python Pipeline
```python
from deepsparse import Pipeline

opt_pipeline = Pipeline.create(task="opt")

inference = opt_pipeline("Who is the president of the United States?")

>> 'The president of the United States is the head of the executive branch of government...'
```

#### HTTP Server
Spinning up:
```bash
deepsparse.server \
task text-generation \
--model_path # TODO: Pending until text generation models get uploaded to SparseZoo
```

Making a request:
```python
import requests

url = "http://localhost:5543/predict" # Server's port default to 5543

obj = {"sequence": "Who is the president of the United States?"}

response = requests.post(url, json=obj)
response.text

>> 'The president of the United States is the head of the executive branch of government...'
```

### Sentiment Analysis
The sentiment analysis task takes in a sentence and classifies its sentiment. The following example
15 changes: 15 additions & 0 deletions src/deepsparse/transformers/engines/__init__.py
@@ -0,0 +1,15 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# flake8: noqa
from .nl_decoder_engine import *
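The wildcard import exposes whatever the new `nl_decoder_engine` module defines; hypothetically, assuming it exports an `NLDecoderEngine` class (the class name is an assumption, as this diff does not show the module's contents):

```python
# Assumed export; the actual symbol names live in nl_decoder_engine.py
from deepsparse.transformers.engines import NLDecoderEngine  # noqa: F401
```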