[server] Update server routes to be compliant with MLServer (#1237)
* update/clean-up server to match mlserver docs

* update server tests

* add back ping

* [server] Refactor + OpenAI Chat Completion Support (#1288)

* refactor server for different integrations; additional functionality for chat completion streaming and non-streaming

* further refactor server

* add support such that openai can host multiple models

* update all tests

* fix output for n > 1

* add inline comment explaining ProxyPipeline

* [server] Update OpenAI Model Support (#1300)

* update server

* allow users to send requests with new models

* use v1; move around baseroutes

* add openai path

* PR comments

* clean-up output classes to be dataclasses, add docstrings, cleanup generation kwargs

* update readme, update route cleaning, update docstring

* fix README for QA
dsikka committed Oct 11, 2023
1 parent 1969d5d commit 639e9e4
Showing 19 changed files with 1,342 additions and 430 deletions.
2 changes: 1 addition & 1 deletion examples/openai-server/protocol.py
@@ -72,7 +72,7 @@ class UsageInfo(BaseModel):

class ChatCompletionRequest(BaseModel):
model: str
messages: Union[str, List[Dict[str, str]]]
messages: Union[str, List[str]]
temperature: Optional[float] = 0.7
top_p: Optional[float] = 1.0
n: Optional[int] = 1
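
For context, a request body matching the updated `ChatCompletionRequest` schema might look like the sketch below. The server URL, the `/v1/chat/completions` path (inferred from the "use v1" commit note), and the model name are assumptions for illustration and are not part of this diff.

```python
import requests

# Illustrative payload for the updated ChatCompletionRequest schema:
# `messages` is now either a single string or a list of strings.
payload = {
    "model": "local_codegen",        # hypothetical endpoint/model name
    "messages": "def fibonacci(n):",
    "temperature": 0.7,
    "top_p": 1.0,
    "n": 1,
}

# The /v1/chat/completions path is an assumption, not shown in this diff.
response = requests.post("http://localhost:5543/v1/chat/completions", json=payload)
print(response.json())
```
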
1 change: 1 addition & 0 deletions setup.py
@@ -298,6 +298,7 @@ def _setup_entry_points() -> Dict:
"deepsparse.benchmark_pipeline=deepsparse.benchmark.benchmark_pipeline:main", # noqa E501
"deepsparse.benchmark_sweep=deepsparse.benchmark.benchmark_sweep:main",
"deepsparse.server=deepsparse.server.cli:main",
"deepsparse.openai=deepsparse.server.cli:openai",
"deepsparse.object_detection.annotate=deepsparse.yolo.annotate:main",
"deepsparse.yolov8.annotate=deepsparse.yolov8.annotate:main",
"deepsparse.yolov8.eval=deepsparse.yolov8.validation:main",
46 changes: 35 additions & 11 deletions src/deepsparse/server/README.md
@@ -68,6 +68,35 @@ Commands:
config Run the server using configuration from a .yaml file.
task Run the server using configuration with CLI options, which can...
```
---
<h3>Note on the latest server release</h3>
Endpoints have been updated so that all inference routes now follow the pattern
`/v2/models/<route>/infer`. In addition, a set of companion endpoints is created for
each configured endpoint, including `/v2/models/<route>/ready` and `/v2/models/<route>`,
which provide health checks and metadata for the pipelines served at that endpoint.
For example, if the route `/pruned/model_1` was previously provided, the following
endpoint would be available:
```
http://localhost:5543/pruned/model_1
```
Now, the following endpoints are available:
```
http://localhost:5543/v2/models/pruned/model_1/infer
http://localhost:5543/v2/models/pruned/model_1/ready
http://localhost:5543/v2/models/pruned/model_1
```
The same applies when a name is provided in the config file instead of a route.
When neither a name nor a route is provided, a name is generated for the endpoint
based on the task.
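
As a quick sanity check, a minimal sketch like the following could be used to hit the new routes. The port and the `pruned/model_1` route come from the example above; the server-level `/v2/health/ready` route is assumed to be enabled.

```python
import requests

base = "http://localhost:5543"

# Server-level health check (assumed to be exposed by the running server).
print(requests.get(f"{base}/v2/health/ready").status_code)

# Per-endpoint readiness and metadata for the example route above.
print(requests.get(f"{base}/v2/models/pruned/model_1/ready").json())
print(requests.get(f"{base}/v2/models/pruned/model_1").json())
```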
---
### Single Model Inference
@@ -84,7 +113,7 @@ To make a request to your server, use the `requests` library and pass the request
```python
import requests

url = "http://localhost:5543/predict"
url = "http://localhost:5543/v2/models/question_answering/infer"

obj = {
"question": "Who is Mark?",
Expand All @@ -98,7 +127,7 @@ In addition, you can make a request with a `curl` command from terminal:

```bash
curl -X POST \
'http://localhost:5543/predict' \
'http://localhost:5543/v2/models/question_answering/infer' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
@@ -116,26 +145,21 @@ num_cores: 2
num_workers: 2
endpoints:
- task: question_answering
route: /unpruned/predict
route: /unpruned
model: zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/base-none
batch_size: 1
- task: question_answering
route: /pruned/predict
route: /pruned
model: zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/12layer_pruned80_quant-none-vnni
batch_size: 1
```
You can now run the server with the config file path using the `config` sub command:

```bash
deepsparse.server config config.yaml
```

You can send requests to a specific model by including its configured `route` from the `config.yaml` in the request URL. For example, to call the second model, you can send a request to its configured route:

```python
import requests

url = "http://localhost:5543/pruned/predict"
url = "http://localhost:5543/v2/models/pruned/infer"

obj = {
"question": "Who is Mark?",
@@ -151,5 +175,5 @@ All you need is to add `/docs` at the end of your host URL:

localhost:5543/docs

![alt text](./img/swagger_ui.png)
![alt text](./img/endpoints.png)

58 changes: 45 additions & 13 deletions src/deepsparse/server/cli.py
@@ -29,9 +29,12 @@

from deepsparse.pipeline import SupportedTasks
from deepsparse.server.config import EndpointConfig, ServerConfig
from deepsparse.server.server import start_server
from deepsparse.server.deepsparse_server import DeepsparseServer
from deepsparse.server.openai_server import OpenAIServer
from deepsparse.server.sagemaker import SagemakerServer


SUPPORTED_INTEGRATIONS = ["local", "sagemaker", "openai"]
HOST_OPTION = click.option(
"--host",
type=str,
@@ -109,7 +112,7 @@

INTEGRATION_OPTION = click.option(
"--integration",
type=click.Choice(["local", "sagemaker"], case_sensitive=False),
type=click.Choice(SUPPORTED_INTEGRATIONS, case_sensitive=False),
default="local",
help=(
"Name of deployment integration that this server will be deployed to "
@@ -234,14 +237,33 @@ def main(
config_path = os.path.join(tmp_dir, "server-config.yaml")
with open(config_path, "w") as fp:
yaml.dump(cfg.dict(), fp)
start_server(
config_path, host, port, log_level, hot_reload_config=hot_reload_config

server = _fetch_server(integration=integration, config_path=config_path)
server.start_server(
host, port, log_level, hot_reload_config=hot_reload_config
)

if config_file is not None:
start_server(
config_file, host, port, log_level, hot_reload_config=hot_reload_config
)
server = _fetch_server(integration=integration, config_path=config_file)
server.start_server(host, port, log_level, hot_reload_config=hot_reload_config)


@main.command(
context_settings=dict(
token_normalize_func=lambda x: x.replace("-", "_"), show_default=True
),
)
@click.argument("config-file", type=str)
@HOST_OPTION
@PORT_OPTION
@LOG_LEVEL_OPTION
@HOT_RELOAD_OPTION
def openai(
config_file: str, host: str, port: int, log_level: str, hot_reload_config: bool
):

server = OpenAIServer(server_config=config_file)
server.start_server(host, port, log_level, hot_reload_config=hot_reload_config)


@main.command(
Expand All @@ -264,9 +286,6 @@ def config(
"Use the `--config_file` argument instead.",
category=DeprecationWarning,
)
start_server(
config_path, host, port, log_level, hot_reload_config=hot_reload_config
)


@main.command(
@@ -319,7 +338,6 @@ def task(
EndpointConfig(
task=task,
name=f"{task}",
route="/predict",
model=model_path,
batch_size=batch_size,
)
@@ -331,8 +349,22 @@
config_path = os.path.join(tmp_dir, "server-config.yaml")
with open(config_path, "w") as fp:
yaml.dump(cfg.dict(), fp)
start_server(
config_path, host, port, log_level, hot_reload_config=hot_reload_config

server = _fetch_server(integration=integration, config_path=config_path)
server.start_server(host, port, log_level, hot_reload_config=hot_reload_config)


def _fetch_server(integration: str, config_path: str):
if integration == "local":
return DeepsparseServer(server_config=config_path)
elif integration == "sagemaker":
return SagemakerServer(server_config=config_path)
elif integration == "openai":
return OpenAIServer(server_config=config_path)
else:
raise ValueError(
f"{integration} is not a supported integration. Must be "
f"one of {SUPPORTED_INTEGRATIONS}."
)


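
For reference, the new `openai` command above reduces to constructing the server class directly. A minimal programmatic sketch follows; the `config.yaml` filename, host, port, and log level are placeholders standing in for what the CLI options would normally supply.

```python
from deepsparse.server.openai_server import OpenAIServer

# Equivalent of `deepsparse.openai config.yaml` from the CLI command above.
# The config path, host, port, and log level here are illustrative values.
server = OpenAIServer(server_config="config.yaml")
server.start_server("0.0.0.0", 5543, "info", hot_reload_config=False)
```
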
168 changes: 168 additions & 0 deletions src/deepsparse/server/deepsparse_server.py
@@ -0,0 +1,168 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from functools import partial

from deepsparse import Pipeline
from deepsparse.server.config import EndpointConfig
from deepsparse.server.server import CheckReady, ModelMetaData, ProxyPipeline, Server
from fastapi import FastAPI


_LOGGER = logging.getLogger(__name__)


class DeepsparseServer(Server):
def _add_routes(self, app):
@app.get("/v2/health/ready", tags=["health"], response_model=CheckReady)
@app.get("/v2/health/live", tags=["health"], response_model=CheckReady)
def _check_health():
return CheckReady(status="OK")

@app.get("/v2", tags=["metadata", "server"], response_model=str)
def _get_server_info():
return "This is the deepsparse server. Hello!"

@app.post("/endpoints", tags=["endpoints"], response_model=bool)
def _add_endpoint_endpoint(cfg: EndpointConfig):
if cfg.name is None:
cfg.name = f"endpoint-{len(app.routes)}"
self._add_endpoint(
app,
cfg,
)
# force regeneration of the docs
app.openapi_schema = None
return True

@app.delete("/endpoints", tags=["endpoints"], response_model=bool)
def _delete_endpoint(cfg: EndpointConfig):
_LOGGER.info(f"Deleting endpoint for {cfg}")
matching = [r for r in app.routes if r.path == cfg.route]
assert len(matching) == 1
app.routes.remove(matching[0])
# force regeneration of the docs
app.openapi_schema = None
return True

for endpoint_config in self.server_config.endpoints:
self._add_endpoint(
app,
endpoint_config,
)

_LOGGER.info(f"Added endpoints: {[route.path for route in app.routes]}")
return app

def _add_endpoint(
self,
app: FastAPI,
endpoint_config: EndpointConfig,
):
pipeline_config = endpoint_config.to_pipeline_config()
pipeline_config.kwargs["executor"] = self.executor

_LOGGER.info(f"Initializing pipeline for '{endpoint_config.name}'")
pipeline = Pipeline.from_config(
pipeline_config, self.context, self.server_logger
)

_LOGGER.info(f"Adding endpoints for '{endpoint_config.name}'")
self._add_inference_endpoints(
app,
endpoint_config,
pipeline,
)
self._add_status_and_metadata_endpoints(app, endpoint_config, pipeline)

def _add_status_and_metadata_endpoints(
self,
app: FastAPI,
endpoint_config: EndpointConfig,
pipeline: Pipeline,
):
routes_and_fns = []
meta_and_fns = []

if endpoint_config.route:
endpoint_config.route = self.clean_up_route(endpoint_config.route)
route_ready = f"/v2/models{endpoint_config.route}/ready"
route_meta = f"/v2/models{endpoint_config.route}"
else:
route_ready = f"/v2/models/{endpoint_config.name}/ready"
route_meta = f"/v2/models/{endpoint_config.name}"

routes_and_fns.append((route_ready, Server.pipeline_ready))
meta_and_fns.append(
(route_meta, partial(Server.model_metadata, ProxyPipeline(pipeline)))
)

self._update_routes(
app=app,
routes_and_fns=meta_and_fns,
response_model=ModelMetaData,
methods=["GET"],
tags=["model", "metadata"],
)
self._update_routes(
app=app,
routes_and_fns=routes_and_fns,
response_model=CheckReady,
methods=["GET"],
tags=["model", "health"],
)

def _add_inference_endpoints(
self,
app: FastAPI,
endpoint_config: EndpointConfig,
pipeline: Pipeline,
):
routes_and_fns = []
if endpoint_config.route:
endpoint_config.route = self.clean_up_route(endpoint_config.route)
route = f"/v2/models{endpoint_config.route}/infer"
else:
route = f"/v2/models/{endpoint_config.name}/infer"

routes_and_fns.append(
(
route,
partial(
Server.predict,
ProxyPipeline(pipeline),
self.server_config.system_logging,
),
)
)
if hasattr(pipeline.input_schema, "from_files"):
routes_and_fns.append(
(
route + "/from_files",
partial(
Server.predict_from_files,
ProxyPipeline(pipeline),
self.server_config.system_logging,
),
)
)

self._update_routes(
app=app,
routes_and_fns=routes_and_fns,
response_model=pipeline.output_schema,
methods=["POST"],
tags=["model", "inference"],
)
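
As an illustration of the `/endpoints` route defined above, a running server could be asked to register a new pipeline at runtime. The payload mirrors the `EndpointConfig` fields used in the README config example; the port and endpoint name are assumptions.

```python
import requests

# Register a new endpoint on a running server via the POST /endpoints route.
# Field names mirror the EndpointConfig entries from the README config example.
payload = {
    "task": "question_answering",
    "name": "pruned",
    "model": "zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/12layer_pruned80_quant-none-vnni",
    "batch_size": 1,
}

# The route's response_model is bool, so a successful call returns True.
print(requests.post("http://localhost:5543/endpoints", json=payload).json())
```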
(The remaining changed files are not shown.)
