[server] Update server routes to be compliant with MLServer #1237

Merged · 16 commits · Oct 11, 2023

Changes from 12 commits
2 changes: 1 addition & 1 deletion examples/openai-server/protocol.py
@@ -72,7 +72,7 @@ class UsageInfo(BaseModel):

class ChatCompletionRequest(BaseModel):
model: str
messages: Union[str, List[Dict[str, str]]]
messages: Union[str, List[str]]
temperature: Optional[float] = 0.7
top_p: Optional[float] = 1.0
n: Optional[int] = 1
1 change: 1 addition & 0 deletions setup.py
@@ -298,6 +298,7 @@ def _setup_entry_points() -> Dict:
"deepsparse.benchmark_pipeline=deepsparse.benchmark.benchmark_pipeline:main", # noqa E501
"deepsparse.benchmark_sweep=deepsparse.benchmark.benchmark_sweep:main",
"deepsparse.server=deepsparse.server.cli:main",
"deepsparse.openai=deepsparse.server.cli:openai",
"deepsparse.object_detection.annotate=deepsparse.yolo.annotate:main",
"deepsparse.yolov8.annotate=deepsparse.yolov8.annotate:main",
"deepsparse.yolov8.eval=deepsparse.yolov8.validation:main",
39 changes: 34 additions & 5 deletions src/deepsparse/server/README.md
@@ -68,6 +68,35 @@ Commands:
config Run the server using configuration from a .yaml file.
task Run the server using configuration with CLI options, which can...
```
---
### Note on the latest server release

Endpoints have been updated so that all inference routes now follow
`/v2/models/<route>/infer`. In addition, each configured endpoint now exposes
`/v2/models/<route>/ready` and `/v2/models/<route>`, which provide health checks
and metadata for the pipelines served at that endpoint.

For example, if the route `/pruned/model_1` was previously provided, the
following endpoint would be available:

```
http://localhost:<port>/pruned/model_1
```

Now, the following endpoints are available:

```
http://localhost:<port>/v2/models/pruned/model_1/infer
http://localhost:<port>/v2/models/pruned/model_1/ready
http://localhost:<port>/v2/models/pruned/model_1
```
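To confirm that the new routes are live, the health and metadata endpoints can be queried directly; a minimal sketch, assuming the server is running locally on port 5543 as in the examples further below:

```bash
# Readiness check for the endpoint registered under /pruned/model_1
curl -X GET 'http://localhost:5543/v2/models/pruned/model_1/ready'

# Metadata for the same endpoint
curl -X GET 'http://localhost:5543/v2/models/pruned/model_1'
```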

The same can be expected when a `name` is provided in the config file instead of a route.
When neither a name nor a route is provided, a name is generated for the endpoint from
the given task (e.g., `question_answering` will create `question_answering-0`).

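As a concrete illustration of the auto-generated name described above, a config entry that omits both `name` and `route` might look like the following sketch (the model stub is reused from the example config later in this README):

```yaml
num_cores: 2
num_workers: 2
endpoints:
  # No `name` or `route` given: the server should register this endpoint
  # as `question_answering-0`, served at /v2/models/question_answering-0/infer
  - task: question_answering
    model: zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/base-none
    batch_size: 1
```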
---

### Single Model Inference

@@ -84,7 +113,7 @@ To make a request to your server, use the `requests` library and pass the reques
```python
import requests

url = "http://localhost:5543/predict"
url = "http://localhost:5543/v2/models/question_answering-0/infer"

obj = {
"question": "Who is Mark?",
@@ -98,7 +127,7 @@ In addition, you can make a request with a `curl` command from terminal:

```bash
curl -X POST \
'http://localhost:5543/predict' \
'http://localhost:5543/v2/models/question_answering-0/infer' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
@@ -116,11 +145,11 @@ num_cores: 2
num_workers: 2
endpoints:
- task: question_answering
route: /unpruned/predict
route: /unpruned
model: zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/base-none
batch_size: 1
- task: question_answering
route: /pruned/predict
route: /pruned
model: zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/12layer_pruned80_quant-none-vnni
batch_size: 1
```
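With the configuration above, each endpoint is exposed under the new `/v2/models/...` scheme; a brief sketch of the resulting routes (assuming port 5543):

```bash
# Readiness checks for the two configured endpoints
curl -X GET 'http://localhost:5543/v2/models/unpruned/ready'
curl -X GET 'http://localhost:5543/v2/models/pruned/ready'

# Inference is served at /v2/models/unpruned/infer and /v2/models/pruned/infer
```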
@@ -135,7 +164,7 @@ You can send requests to a specific model by appending the model's `alias` from
```python
import requests

url = "http://localhost:5543/pruned/predict"
url = "http://localhost:5543/v2/models/pruned/infer"

obj = {
"question": "Who is Mark?",
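The snippet above is cut off by the collapsed diff; a complete request against the pruned endpoint might look like the following sketch (the `context` value and the response handling are illustrative assumptions):

```python
import requests

url = "http://localhost:5543/v2/models/pruned/infer"

# Illustrative payload; the "context" value is an assumption
obj = {
    "question": "Who is Mark?",
    "context": "Mark is batman.",
}

response = requests.post(url, json=obj)
print(response.json())
```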
77 changes: 43 additions & 34 deletions src/deepsparse/server/cli.py
@@ -29,9 +29,12 @@

from deepsparse.pipeline import SupportedTasks
from deepsparse.server.config import EndpointConfig, ServerConfig
from deepsparse.server.server import start_server
from deepsparse.server.deepsparse_server import DeepsparseServer
from deepsparse.server.openai_server import OpenAIServer
from deepsparse.server.sagemaker import SagemakerServer


SUPPORTED_INTEGRATIONS = ["local", "sagemaker", "openai"]
HOST_OPTION = click.option(
"--host",
type=str,
@@ -109,7 +112,7 @@

INTEGRATION_OPTION = click.option(
"--integration",
type=click.Choice(["local", "sagemaker"], case_sensitive=False),
type=click.Choice(SUPPORTED_INTEGRATIONS, case_sensitive=False),
default="local",
help=(
"Name of deployment integration that this server will be deployed to "
@@ -206,6 +209,20 @@ def main(
...
```
"""

def _fetch_server(integration: str, config_path: str):
if integration == "local":
return DeepsparseServer(server_config=config_path)
elif integration == "sagemaker":
return SagemakerServer(server_config=config_path)
elif integration == "openai":
return OpenAIServer(server_config=config_path)
else:
raise ValueError(
f"{integration} is not a supported integration. Must be "
f"one of {SUPPORTED_INTEGRATIONS}."
)

if ctx.invoked_subcommand is not None:
return

@@ -234,14 +251,33 @@ def main(
config_path = os.path.join(tmp_dir, "server-config.yaml")
with open(config_path, "w") as fp:
yaml.dump(cfg.dict(), fp)
start_server(
config_path, host, port, log_level, hot_reload_config=hot_reload_config

server = _fetch_server(integration=integration, config_path=config_path)
server.start_server(
host, port, log_level, hot_reload_config=hot_reload_config
)

if config_file is not None:
start_server(
config_file, host, port, log_level, hot_reload_config=hot_reload_config
)
server = _fetch_server(integration=integration, config_path=config_file)
server.start_server(host, port, log_level, hot_reload_config=hot_reload_config)


@main.command(
context_settings=dict(
token_normalize_func=lambda x: x.replace("-", "_"), show_default=True
),
)
@click.argument("config-file", type=str)
@HOST_OPTION
@PORT_OPTION
@LOG_LEVEL_OPTION
@HOT_RELOAD_OPTION
def openai(
config_file: str, host: str, port: int, log_level: str, hot_reload_config: bool
):

server = OpenAIServer(server_config=config_file)
server.start_server(host, port, log_level, hot_reload_config=hot_reload_config)


@main.command(
@@ -264,9 +300,6 @@ def config(
"Use the `--config_file` argument instead.",
category=DeprecationWarning,
)
start_server(
config_path, host, port, log_level, hot_reload_config=hot_reload_config
)


@main.command(
@@ -311,30 +344,6 @@ def task(
category=DeprecationWarning,
)

cfg = ServerConfig(
num_cores=num_cores,
num_workers=num_workers,
integration=integration,
endpoints=[
EndpointConfig(
task=task,
name=f"{task}",
route="/predict",
model=model_path,
batch_size=batch_size,
)
],
loggers={},
)

with TemporaryDirectory() as tmp_dir:
config_path = os.path.join(tmp_dir, "server-config.yaml")
with open(config_path, "w") as fp:
yaml.dump(cfg.dict(), fp)
start_server(
config_path, host, port, log_level, hot_reload_config=hot_reload_config
)


if __name__ == "__main__":
main()
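Together with the `deepsparse.openai` console script registered in `setup.py` above, the new subcommand can be invoked roughly as follows (a sketch; the config file name is a placeholder):

```bash
# Dedicated console script added in setup.py
deepsparse.openai server-config.yaml --host 0.0.0.0 --port 5543

# Equivalent: the main CLI with the new "openai" integration option
deepsparse.server --integration openai --config_file server-config.yaml
```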
168 changes: 168 additions & 0 deletions src/deepsparse/server/deepsparse_server.py
@@ -0,0 +1,168 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from functools import partial

from deepsparse import Pipeline
from deepsparse.server.config import EndpointConfig
from deepsparse.server.server import CheckReady, ModelMetaData, ProxyPipeline, Server
from fastapi import FastAPI


_LOGGER = logging.getLogger(__name__)


class DeepsparseServer(Server):
def _add_routes(self, app):
@app.get("/v2/health/ready", tags=["health"], response_model=CheckReady)
@app.get("/v2/health/live", tags=["health"], response_model=CheckReady)
def _check_health():
return CheckReady(status="OK")

@app.get("/v2", tags=["metadata", "server"], response_model=str)
def _get_server_info():
return "This is the deepsparse server. Hello!"

@app.post("/endpoints", tags=["endpoints"], response_model=bool)
def _add_endpoint_endpoint(cfg: EndpointConfig):
if cfg.name is None:
cfg.name = f"endpoint-{len(app.routes)}"
self._add_endpoint(
app,
cfg,
)
# force regeneration of the docs
app.openapi_schema = None
return True

@app.delete("/endpoints", tags=["endpoints"], response_model=bool)
def _delete_endpoint(cfg: EndpointConfig):
_LOGGER.info(f"Deleting endpoint for {cfg}")
matching = [r for r in app.routes if r.path == cfg.route]
assert len(matching) == 1
app.routes.remove(matching[0])
# force regeneration of the docs
app.openapi_schema = None
return True

for endpoint_config in self.server_config.endpoints:
self._add_endpoint(
app,
endpoint_config,
)

_LOGGER.info(f"Added endpoints: {[route.path for route in app.routes]}")
return app

def _add_endpoint(
self,
app: FastAPI,
endpoint_config: EndpointConfig,
):
pipeline_config = endpoint_config.to_pipeline_config()
pipeline_config.kwargs["executor"] = self.executor

_LOGGER.info(f"Initializing pipeline for '{endpoint_config.name}'")
pipeline = Pipeline.from_config(
pipeline_config, self.context, self.server_logger
)

_LOGGER.info(f"Adding endpoints for '{endpoint_config.name}'")
self._add_inference_endpoints(
app,
endpoint_config,
pipeline,
)
self._add_status_and_metadata_endpoints(app, endpoint_config, pipeline)

def _add_status_and_metadata_endpoints(
self,
app: FastAPI,
endpoint_config: EndpointConfig,
pipeline: Pipeline,
):
routes_and_fns = []
meta_and_fns = []

if endpoint_config.route:
endpoint_config.route = self.clean_up_route(endpoint_config.route)
route_ready = f"/v2/models{endpoint_config.route}/ready"
route_meta = f"/v2/models{endpoint_config.route}"
else:
route_ready = f"/v2/models/{endpoint_config.name}/ready"
route_meta = f"/v2/models/{endpoint_config.name}"

routes_and_fns.append((route_ready, Server.pipeline_ready))
meta_and_fns.append(
(route_meta, partial(Server.model_metadata, ProxyPipeline(pipeline)))
)

self._update_routes(
app=app,
routes_and_fns=meta_and_fns,
response_model=ModelMetaData,
methods=["GET"],
tags=["model", "metadata"],
)
self._update_routes(
app=app,
routes_and_fns=routes_and_fns,
response_model=CheckReady,
methods=["GET"],
tags=["model", "health"],
)

def _add_inference_endpoints(
self,
app: FastAPI,
endpoint_config: EndpointConfig,
pipeline: Pipeline,
):
routes_and_fns = []
if endpoint_config.route:
endpoint_config.route = self.clean_up_route(endpoint_config.route)
route = f"/v2/models{endpoint_config.route}/infer"
else:
route = f"/v2/models/{endpoint_config.name}/infer"

routes_and_fns.append(
(
route,
partial(
Server.predict,
ProxyPipeline(pipeline),
self.server_config.system_logging,
),
)
)
if hasattr(pipeline.input_schema, "from_files"):
routes_and_fns.append(
(
route + "/from_files",
partial(
Server.predict_from_files,
ProxyPipeline(pipeline),
self.server_config.system_logging,
),
)
)

self._update_routes(
app=app,
routes_and_fns=routes_and_fns,
response_model=pipeline.output_schema,
methods=["POST"],
tags=["model", "inference"],
)
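The `/endpoints` routes defined above also allow endpoints to be added at runtime by POSTing an `EndpointConfig` payload (and removed via DELETE); a hedged curl sketch, with illustrative field values:

```bash
# Add a question-answering endpoint at runtime; field names mirror
# EndpointConfig, and the model stub shown is illustrative
curl -X POST 'http://localhost:5543/endpoints' \
  -H 'Content-Type: application/json' \
  -d '{
        "task": "question_answering",
        "name": "qa-dynamic",
        "model": "zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/base-none",
        "batch_size": 1
      }'
```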