openai compatible chat api (#441)
Zhikaiiii authored May 22, 2024
1 parent 0063302 commit cc0e507
Showing 10 changed files with 350 additions and 187 deletions.
2 changes: 1 addition & 1 deletion modelscope_agent/agents/role_play.py
@@ -153,7 +153,7 @@ def _run(self,
lang: str = 'zh',
**kwargs):

chat_mode = kwargs.get('chat_mode', False)
chat_mode = kwargs.pop('chat_mode', False)
tools = kwargs.get('tools', None)
tool_choice = kwargs.get('tool_choice', 'auto')

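A minimal illustration (not project code) of the effect of switching from `kwargs.get` to `kwargs.pop`: presumably this consumes `chat_mode` here so that it is not forwarded to downstream calls that receive `**kwargs`.

```Python
# Illustration only: pop() consumes the key, so later calls that receive
# **kwargs no longer see 'chat_mode'.
kwargs = {'chat_mode': True, 'tool_choice': 'auto'}
chat_mode = kwargs.pop('chat_mode', False)

assert chat_mode is True
assert 'chat_mode' not in kwargs
```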
15 changes: 15 additions & 0 deletions modelscope_agent/llm/base.py
@@ -45,6 +45,8 @@ def __init__(self,
self.model_server = model_server
self.max_length = 6000

self.last_call_usage_info = {}

# It is okay to use the same code to handle the output
# regardless of whether stream is True or False, as follows:
# ```py
@@ -239,3 +241,16 @@ def check_max_length(self, messages: Union[List[Dict], str]) -> bool:

def get_max_length(self) -> int:
return self.max_length

def get_usage(self) -> Dict:
return self.last_call_usage_info

def stat_last_call_token_info(self, response):
try:
self.last_call_usage_info = response.usage.dict()
return response
except AttributeError:
for chunk in response:
if hasattr(chunk, 'usage') and chunk.usage is not None:
self.last_call_usage_info = chunk.usage.dict()
yield chunk
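A hedged sketch of how the new usage bookkeeping might be consumed by a caller. The `get_chat_model` factory and the exact `chat()` signature are assumed from the surrounding codebase rather than shown in this diff.

```Python
from modelscope_agent.llm import get_chat_model  # assumed factory, not part of this diff

llm = get_chat_model(model='qwen-max', model_server='dashscope')

# streaming call: the chunks must be consumed before usage is recorded
for chunk in llm.chat(messages=[{'role': 'user', 'content': 'hello'}], stream=True):
    pass  # handle streamed output here

# token accounting of the most recent call, if the backend reported it
print(llm.get_usage())
# e.g. {'prompt_tokens': ..., 'completion_tokens': ..., 'total_tokens': ...}
```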
21 changes: 21 additions & 0 deletions modelscope_agent/llm/dashscope.py
@@ -101,6 +101,8 @@ def _chat_stream(self,
if kwargs.get('seed', None):
generation_input['seed'] = kwargs.get('seed')
response = dashscope.Generation.call(**generation_input)
response = self.stat_last_call_token_info(response)
return stream_output(response, **kwargs)

def _chat_no_stream(self,
@@ -119,6 +121,7 @@ def _chat_no_stream(self,
top_p=top_p,
)
if response.status_code == HTTPStatus.OK:
self.stat_last_call_token_info(response)
return response.output.choices[0].message.content
else:
err = 'Error code: %s, error message: %s' % (
@@ -127,6 +130,24 @@ def _chat_no_stream(self,
)
return err

def stat_last_call_token_info(self, response):
try:
self.last_call_usage_info = {
'prompt_tokens': response.usage.input_tokens,
'completion_tokens': response.usage.output_tokens,
'total_tokens': response.usage.total_tokens
}
return response
except AttributeError:
for chunk in response:
# streaming response: update usage from each incoming chunk
self.last_call_usage_info = {
'prompt_tokens': chunk.usage.input_tokens,
'completion_tokens': chunk.usage.output_tokens,
'total_tokens': chunk.usage.total_tokens
}
yield chunk


@register_llm('dashscope_qwen')
@register_llm('dashscope_qwen1.5')
28 changes: 28 additions & 0 deletions modelscope_agent/llm/ollama.py
@@ -15,6 +15,7 @@ def __init__(self, model: str, model_server: str, **kwargs):
host = kwargs.get('host', 'http://localhost:11434')
self.client = ollama.Client(host=host)
self.model = model
self.client.pull(self.model)  # make sure the model is available locally

def _chat_stream(self,
messages: List[Dict],
@@ -25,6 +26,7 @@ def _chat_stream(self,
f'stop: {str(stop)}, stream: True, args: {str(kwargs)}')
stream = self.client.chat(
model=self.model, messages=messages, stream=True)
stream = self.stat_last_call_token_info(stream)
for chunk in stream:
tmp_content = chunk['message']['content']
logger.info(f'call ollama success, output: {tmp_content}')
@@ -40,6 +42,7 @@ def _chat_no_stream(self,
f'call ollama, model: {self.model}, messages: {str(messages)}, '
f'stop: {str(stop)}, stream: False, args: {str(kwargs)}')
response = self.client.chat(model=self.model, messages=messages)
self.stat_last_call_token_info(response)
final_content = response['message']['content']
logger.info(f'call ollama success, output: {final_content}')
return final_content
@@ -83,3 +86,28 @@ def chat(self,
messages = [{'role': 'user', 'content': prompt}]
return super().chat(
messages=messages, stop=stop, stream=stream, **kwargs)

def stat_last_call_token_info(self, response):
try:
self.last_call_usage_info = {
'prompt_tokens':
response.get('prompt_eval_count', -1),
'completion_tokens':
response.get('eval_count', -1),
'total_tokens':
response.get('prompt_eval_count', -1) + response.get('eval_count', -1)
}
return response
except AttributeError:
for chunk in response:
# streaming response: update usage from each incoming chunk
self.last_call_usage_info = {
'prompt_tokens':
chunk.get('prompt_eval_count', -1),
'completion_tokens':
chunk.get('eval_count', -1),
'total_tokens':
chunk.get('prompt_eval_count', -1)
+ chunk.get('eval_count', -1)
}
yield chunk
9 changes: 7 additions & 2 deletions modelscope_agent/llm/openai.py
@@ -36,17 +36,21 @@ def _chat_stream(self,
logger.info(
f'call openai api, model: {self.model}, messages: {str(messages)}, '
f'stop: {str(stop)}, stream: True, args: {str(kwargs)}')
stream_options = {'include_usage': True}
response = self.client.chat.completions.create(
model=self.model,
messages=messages,
stop=stop,
stream=True,
stream_options=stream_options,
**kwargs)
response = self.stat_last_call_token_info(response)
# TODO: error handling
for chunk in response:
# sometimes delta.content is None by vllm, we should not yield None
if hasattr(chunk.choices[0].delta,
'content') and chunk.choices[0].delta.content:
if len(chunk.choices) > 0 and hasattr(
chunk.choices[0].delta,
'content') and chunk.choices[0].delta.content:
logger.info(
f'call openai api success, output: {chunk.choices[0].delta.content}'
)
@@ -66,6 +70,7 @@ def _chat_no_stream(self,
stop=stop,
stream=False,
**kwargs)
self.stat_last_call_token_info(response)
logger.info(
f'call openai api success, output: {response.choices[0].message.content}'
)
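For context on the new `len(chunk.choices) > 0` guard: when `stream_options={'include_usage': True}` is set, an OpenAI-style stream ends with an extra chunk whose `choices` list is empty and which carries only `usage`. A hedged sketch against a generic OpenAI-compatible endpoint (URL and model name are placeholders):

```Python
from openai import OpenAI

client = OpenAI(api_key='EMPTY', base_url='http://localhost:31512/v1/')  # placeholder endpoint
stream = client.chat.completions.create(
    model='qwen-max',
    messages=[{'role': 'user', 'content': 'hi'}],
    stream=True,
    stream_options={'include_usage': True},
)
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end='')
    if getattr(chunk, 'usage', None) is not None:
        # final chunk: empty choices, usage only
        print('\n', chunk.usage)
```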
159 changes: 71 additions & 88 deletions modelscope_agent_servers/README.md
@@ -38,21 +38,60 @@ cd modelscope-agent
# start the assistant server
sh scripts/run_assistant_server.sh

# start the assistant server with a specified backend
sh scripts/run_assistant_server.sh dashscope
```

### Use case

#### Chat

We provide compatibility with part of the OpenAI `chat/completions` API, in particular function calling. Developers can use the `OpenAI` SDK pointed at the local server URL. The currently supported model servers are `dashscope`, `openai`, and `ollama`.

To interact with the chat API, construct a `ChatRequest`-like object on the client side and send it as the request body, either through the `OpenAI` SDK or with a plain HTTP client.

#### Function calling
Here is a code snippet that uses the `OpenAI` SDK with the `dashscope` model server:
```Python
from openai import OpenAI

api_base = "http://localhost:31512/v1/"
model = 'qwen-max'

tools = [{
"type": "function",
"function": {
"name": "amap_weather",
"description": "amap weather tool",
"parameters": [{
"name": "location",
"type": "string",
"description": "城市/区具体名称,如`北京市海淀区`请描述为`海淀区`",
"required": True
}]
}
}]

tool_choice = 'auto'

client = OpenAI(
api_key="YOUR_DASHSCOPE_API_KEY",
base_url=api_base,
)
chat_completion = client.chat.completions.create(
messages=[{
"role": "user",
"content": "海淀区天气是什么?"
}],
model=model,
tools=tools,
tool_choice=tool_choice
)

```
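Assuming the model decides to call the tool, the call can be read back from the standard OpenAI response objects, for example:

```Python
tool_call = chat_completion.choices[0].message.tool_calls[0]
print(tool_call.function.name)       # e.g. "amap_weather"
print(tool_call.function.arguments)  # e.g. '{"location": "海淀区"}'
```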

You can also use `curl` to request this API.

```Shell
curl -X POST 'http://localhost:31512/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-d '{
"tools": [{
"type": "function",
@@ -68,108 +107,51 @@ curl -X POST 'http://localhost:31512/v1/chat/completions' \
}
}],
"tool_choice": "auto",
"llm_config": {
"model": "qwen-max",
"model_server": "dashscope",
"api_key": "YOUR DASHSCOPE API KEY"
},
"model": "qwen-max",
"messages": [
{"content": "海淀区天气", "role": "user"}
],
"uuid_str": "test",
"stream": false
}'

```

With the above examples, the output should look like this:
```Python
{
    "request_id": "xxx",
    "id": "xxx",
    "choices": [{
        "index": 0,
        "message": {
            "role": "assistant",
            "content": "Action: amap_weather\nAction Input: {\"location\": \"海淀区\"}\n",
            "tool_calls": [{
                "type": "function",
                "function": {
                    "name": "amap_weather",
                    "arguments": "{\"location\": \"海淀区\"}"
                }
            }]
        },
        "finish_reason": "tool_calls"
    }],
    "created": xxx,
    "model": "qwen-max",
    "object": "chat.completion",
    "usage": {"prompt_tokens": 267, "completion_tokens": 15, "total_tokens": 282}
}
```

#### Knowledge retrieval

In the `assistants/lite` API, to enable knowledge retrieval you need to include `use_knowledge` and `files` in your request:

- `use_knowledge`: Specifies whether knowledge retrieval should be activated.
- `files`: the file(s) you wish to use during the conversation. By default, all previously uploaded files will be used.

```Shell
curl -X POST 'http://localhost:31512/v1/chat/completions' \
-H 'Content-Type: application/json' \
-d '{
"tools": [
{
"type": "function",
"function": {
"name": "amap_weather",
"description": "amap weather tool",
"parameters": [{
"name": "location",
"type": "string",
"description": "城市/区具体名称,如`北京市海淀区`请描述为`海淀区`",
"required": true
}]
}
}],
"llm_config": {
"model": "qwen-max",
"model_server": "dashscope",
"api_key": "YOUR DASHSCOPE API KEY"
},
"messages": [
{"content": "高德天气api申请", "role": "user"}
],
"uuid_str": "test",
"stream": false,
"use_knowledge": true,
"files": ["QA.pdf"]
}'
```

With the above example, the output should look like this:
```Python
{
"request_id":"2bdb05fb-48b6-4ba2-9a38-7c9eb7c5c88e",
"message":"",
"output": None,
"choices": [{
"index":0,
"message": {
"role": "assistant",
"content": "Information based on knowledge retrieval.",
}
"finish_reason": "stop"

}]
}
```

#### Assistant

As with the `v1/chat/completions` API, you should construct a `ChatRequest` object when using `v1/assistants/lite`. Here is an example using the Python `requests` library.


```Python
import os
import requests
@@ -194,10 +176,11 @@ request = {
'agent_config': agent_cfg,
'llm_config': llm_cfg,
'messages': [
{'content': '高德天气API申请', 'role': 'user'}
],
'uuid_str': 'test',
'use_knowledge': True,  # whether to use knowledge
'files': ['QA.pdf']
}

response = requests.post(url, json=request)
@@ -211,7 +194,7 @@ request = {
'agent_config': agent_cfg,
'llm_config': llm_cfg,
'messages': [
{'content': '高德天气API申请', 'role': 'user'}
],
'uuid_str': 'test',
'stream': True, # whether to use stream