openai compatible chat api (#441)
Zhikaiiii authored May 22, 2024
1 parent 0063302 commit cc0e507
Showing 10 changed files with 350 additions and 187 deletions.
2 changes: 1 addition & 1 deletion modelscope_agent/agents/role_play.py
@@ -153,7 +153,7 @@ def _run(self,
lang: str = 'zh',
**kwargs):

chat_mode = kwargs.get('chat_mode', False)
chat_mode = kwargs.pop('chat_mode', False)
tools = kwargs.get('tools', None)
tool_choice = kwargs.get('tool_choice', 'auto')

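A minimal illustration (not project code) of the effect of switching from `kwargs.get` to `kwargs.pop`: presumably this consumes `chat_mode` here so that it is not forwarded to downstream calls that receive `**kwargs`.

```Python
# Illustration only: pop() consumes the key, so later calls that receive
# **kwargs no longer see 'chat_mode'.
kwargs = {'chat_mode': True, 'tool_choice': 'auto'}
chat_mode = kwargs.pop('chat_mode', False)

assert chat_mode is True
assert 'chat_mode' not in kwargs
```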
15 changes: 15 additions & 0 deletions modelscope_agent/llm/base.py
@@ -45,6 +45,8 @@ def __init__(self,
self.model_server = model_server
self.max_length = 6000

self.last_call_usage_info = {}

# It is okay to use the same code to handle the output
# regardless of whether stream is True or False, as follows:
# ```py
@@ -239,3 +241,16 @@ def check_max_length(self, messages: Union[List[Dict], str]) -> bool:

def get_max_length(self) -> int:
return self.max_length

def get_usage(self) -> Dict:
return self.last_call_usage_info

def stat_last_call_token_info(self, response):
try:
self.last_call_usage_info = response.usage.dict()
return response
except AttributeError:
for chunk in response:
if hasattr(chunk, 'usage') and chunk.usage is not None:
self.last_call_usage_info = chunk.usage.dict()
yield chunk
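A hedged sketch of how the new usage bookkeeping might be consumed by a caller. The `get_chat_model` factory and the exact `chat()` signature are assumed from the surrounding codebase rather than shown in this diff.

```Python
from modelscope_agent.llm import get_chat_model  # assumed factory, not part of this diff

llm = get_chat_model(model='qwen-max', model_server='dashscope')

# streaming call: the chunks must be consumed before usage is recorded
for chunk in llm.chat(messages=[{'role': 'user', 'content': 'hello'}], stream=True):
    pass  # handle streamed output here

# token accounting of the most recent call, if the backend reported it
print(llm.get_usage())
# e.g. {'prompt_tokens': ..., 'completion_tokens': ..., 'total_tokens': ...}
```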
21 changes: 21 additions & 0 deletions modelscope_agent/llm/dashscope.py
@@ -101,6 +101,8 @@ def _chat_stream(self,
if kwargs.get('seed', None):
generation_input['seed'] = kwargs.get('seed')
response = dashscope.Generation.call(**generation_input)
response = self.stat_last_call_token_info(response)
return stream_output(response, **kwargs)

def _chat_no_stream(self,
@@ -119,6 +121,7 @@ def _chat_no_stream(self,
top_p=top_p,
)
if response.status_code == HTTPStatus.OK:
self.stat_last_call_token_info(response)
return response.output.choices[0].message.content
else:
err = 'Error code: %s, error message: %s' % (
@@ -127,6 +130,24 @@ def _chat_no_stream(self,
)
return err

def stat_last_call_token_info(self, response):
try:
self.last_call_usage_info = {
'prompt_tokens': response.usage.input_tokens,
'completion_tokens': response.usage.output_tokens,
'total_tokens': response.usage.total_tokens
}
return response
except AttributeError:
for chunk in response:
# streaming response: update usage from each incoming chunk
self.last_call_usage_info = {
'prompt_tokens': chunk.usage.input_tokens,
'completion_tokens': chunk.usage.output_tokens,
'total_tokens': chunk.usage.total_tokens
}
yield chunk


@register_llm('dashscope_qwen')
@register_llm('dashscope_qwen1.5')
28 changes: 28 additions & 0 deletions modelscope_agent/llm/ollama.py
@@ -15,6 +15,7 @@ def __init__(self, model: str, model_server: str, **kwargs):
host = kwargs.get('host', 'http://localhost:11434')
self.client = ollama.Client(host=host)
self.model = model
self.client.pull(self.model)  # make sure the model is available locally

def _chat_stream(self,
messages: List[Dict],
@@ -25,6 +26,7 @@ def _chat_stream(self,
f'stop: {str(stop)}, stream: True, args: {str(kwargs)}')
stream = self.client.chat(
model=self.model, messages=messages, stream=True)
stream = self.stat_last_call_token_info(stream)
for chunk in stream:
tmp_content = chunk['message']['content']
logger.info(f'call ollama success, output: {tmp_content}')
@@ -40,6 +42,7 @@ def _chat_no_stream(self,
f'call ollama, model: {self.model}, messages: {str(messages)}, '
f'stop: {str(stop)}, stream: False, args: {str(kwargs)}')
response = self.client.chat(model=self.model, messages=messages)
self.stat_last_call_token_info(response)
final_content = response['message']['content']
logger.info(f'call ollama success, output: {final_content}')
return final_content
@@ -83,3 +86,28 @@ def chat(self,
messages = [{'role': 'user', 'content': prompt}]
return super().chat(
messages=messages, stop=stop, stream=stream, **kwargs)

def stat_last_call_token_info(self, response):
try:
self.last_call_usage_info = {
'prompt_tokens':
response.get('prompt_eval_count', -1),
'completion_tokens':
response.get('eval_count', -1),
'total_tokens':
response.get('prompt_eval_count', -1) + response.get('eval_count', -1)
}
return response
except AttributeError:
for chunk in response:
# streaming response: update usage from each incoming chunk
self.last_call_usage_info = {
'prompt_tokens':
chunk.get('prompt_eval_count', -1),
'completion_tokens':
chunk.get('eval_count', -1),
'total_tokens':
chunk.get('prompt_eval_count', -1)
+ chunk.get('eval_count', -1)
}
yield chunk
9 changes: 7 additions & 2 deletions modelscope_agent/llm/openai.py
@@ -36,17 +36,21 @@ def _chat_stream(self,
logger.info(
f'call openai api, model: {self.model}, messages: {str(messages)}, '
f'stop: {str(stop)}, stream: True, args: {str(kwargs)}')
stream_options = {'include_usage': True}
response = self.client.chat.completions.create(
model=self.model,
messages=messages,
stop=stop,
stream=True,
stream_options=stream_options,
**kwargs)
response = self.stat_last_call_token_info(response)
# TODO: error handling
for chunk in response:
# sometimes delta.content is None by vllm, we should not yield None
if hasattr(chunk.choices[0].delta,
'content') and chunk.choices[0].delta.content:
if len(chunk.choices) > 0 and hasattr(
chunk.choices[0].delta,
'content') and chunk.choices[0].delta.content:
logger.info(
f'call openai api success, output: {chunk.choices[0].delta.content}'
)
@@ -66,6 +70,7 @@ def _chat_no_stream(self,
stop=stop,
stream=False,
**kwargs)
self.stat_last_call_token_info(response)
logger.info(
f'call openai api success, output: {response.choices[0].message.content}'
)
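For context on the new `len(chunk.choices) > 0` guard: when `stream_options={'include_usage': True}` is set, an OpenAI-style stream ends with an extra chunk whose `choices` list is empty and which carries only `usage`. A hedged sketch against a generic OpenAI-compatible endpoint (URL and model name are placeholders):

```Python
from openai import OpenAI

client = OpenAI(api_key='EMPTY', base_url='http://localhost:31512/v1/')  # placeholder endpoint
stream = client.chat.completions.create(
    model='qwen-max',
    messages=[{'role': 'user', 'content': 'hi'}],
    stream=True,
    stream_options={'include_usage': True},
)
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end='')
    if getattr(chunk, 'usage', None) is not None:
        # final chunk: empty choices, usage only
        print('\n', chunk.usage)
```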
159 changes: 71 additions & 88 deletions modelscope_agent_servers/README.md
@@ -38,21 +38,60 @@ cd modelscope-agent
# start the assistant server
sh scripts/run_assistant_server.sh

# start the assistant server with a specified backend
sh scripts/run_assistant_server.sh dashscope
```

### Use case

#### Chat

We provide compatibility with part of the OpenAI `chat/completions` API, in particular function calling. Developers can use the `OpenAI` SDK pointed at the local server URL. The currently supported model servers are `dashscope`, `openai`, and `ollama`.

To interact with the chat API, construct a `ChatRequest`-like object on the client side and send it as the request body, either through the `OpenAI` SDK or with a plain HTTP client.

#### Function calling
Here is a code snippet that uses the `OpenAI` SDK with the `dashscope` model server:
```Python
from openai import OpenAI

api_base = "http://localhost:31512/v1/"
model = 'qwen-max'

tools = [{
"type": "function",
"function": {
"name": "amap_weather",
"description": "amap weather tool",
"parameters": [{
"name": "location",
"type": "string",
"description": "城市/区具体名称,如`北京市海淀区`请描述为`海淀区`",
"required": True
}]
}
}]

tool_choice = 'auto'

client = OpenAI(
api_key="YOUR_DASHSCOPE_API_KEY",
base_url=api_base,
)
chat_completion = client.chat.completions.create(
messages=[{
"role": "user",
"content": "海淀区天气是什么?"
}],
model=model,
tools=tools,
tool_choice=tool_choice
)

```
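Assuming the model decides to call the tool, the call can be read back from the standard OpenAI response objects, for example:

```Python
tool_call = chat_completion.choices[0].message.tool_calls[0]
print(tool_call.function.name)       # e.g. "amap_weather"
print(tool_call.function.arguments)  # e.g. '{"location": "海淀区"}'
```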

You can also use `curl` to request this API.

```Shell
curl -X POST 'http://localhost:31512/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-d '{
"tools": [{
"type": "function",
@@ -68,108 +107,51 @@ curl -X POST 'http://localhost:31512/v1/chat/completions' \
}
}],
"tool_choice": "auto",
"llm_config": {
"model": "qwen-max",
"model_server": "dashscope",
"api_key": "YOUR DASHSCOPE API KEY"
},
"model": "qwen-max",
"messages": [
{"content": "海淀区天气", "role": "user"}
],
"uuid_str": "test",
"stream": false
}'

```

With the above examples, the output should look like this:
```Python
{
    "request_id": "xxx",
    "id": "xxx",
    "choices": [{
        "index": 0,
        "message": {
            "role": "assistant",
            "content": "Action: amap_weather\nAction Input: {\"location\": \"海淀区\"}\n",
            "tool_calls": [{
                "type": "function",
                "function": {
                    "name": "amap_weather",
                    "arguments": "{\"location\": \"海淀区\"}"
                }
            }]
        },
        "finish_reason": "tool_calls"
    }],
    "created": xxx,
    "model": "qwen-max",
    "object": "chat.completion",
    "usage": {"prompt_tokens": 267, "completion_tokens": 15, "total_tokens": 282}
}
```

#### Knowledge retrieval

In the `assistants/lite` API, to enable knowledge retrieval you need to include `use_knowledge` and `files` in your request:

- `use_knowledge`: Specifies whether knowledge retrieval should be activated.
- `files`: the file(s) you wish to use during the conversation. By default, all previously uploaded files will be used.

```Shell
curl -X POST 'http://localhost:31512/v1/chat/completions' \
-H 'Content-Type: application/json' \
-d '{
"tools": [
{
"type": "function",
"function": {
"name": "amap_weather",
"description": "amap weather tool",
"parameters": [{
"name": "location",
"type": "string",
"description": "城市/区具体名称,如`北京市海淀区`请描述为`海淀区`",
"required": true
}]
}
}],
"llm_config": {
"model": "qwen-max",
"model_server": "dashscope",
"api_key": "YOUR DASHSCOPE API KEY"
},
"messages": [
{"content": "高德天气api申请", "role": "user"}
],
"uuid_str": "test",
"stream": false,
"use_knowledge": true,
"files": ["QA.pdf"]
}'
```

With the above example, the output should look like this:
```Python
{
"request_id":"2bdb05fb-48b6-4ba2-9a38-7c9eb7c5c88e",
"message":"",
"output": None,
"choices": [{
"index":0,
"message": {
"role": "assistant",
"content": "Information based on knowledge retrieval.",
}
"finish_reason": "stop"

}]
}
```

#### Assistant

As with the `v1/chat/completions` API, you should construct a `ChatRequest` object when using `v1/assistants/lite`. Here is an example using the Python `requests` library.


```Python
import os
import requests
@@ -194,10 +176,11 @@ request = {
'agent_config': agent_cfg,
'llm_config': llm_cfg,
'messages': [
{'content': '高德天气API申请', 'role': 'user'}
],
'uuid_str': 'test',
'use_knowledge': True,  # whether to use knowledge
'files': ['QA.pdf']
}

response = requests.post(url, json=request)
@@ -211,7 +194,7 @@ request = {
'agent_config': agent_cfg,
'llm_config': llm_cfg,
'messages': [
{'content': '高德天气API申请', 'role': 'user'}
],
'uuid_str': 'test',
'stream': True, # whether to use stream