Merge branch 'master' into master
jasonsmithio authored Sep 10, 2024
2 parents ccdb326 + 87c9823, commit 6c4232d
Showing 40 changed files with 838 additions and 256 deletions.
48 changes: 48 additions & 0 deletions .github/workflows/ci_graviton_cpu.yml
@@ -0,0 +1,48 @@
name: CI CPU Graviton

on:
  workflow_dispatch:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master
  merge_group:


concurrency:
  group: ci-cpu-${{ github.workflow }}-${{ github.ref == 'refs/heads/master' && github.run_number || github.ref }}
  cancel-in-progress: true

jobs:
  ci-cpu:
    runs-on: [self-hosted, graviton-test]
    steps:
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
          architecture: arm64
      - name: Setup Java 17
        uses: actions/setup-java@v3
        with:
          distribution: 'zulu'
          java-version: '17'
      - name: Checkout TorchServe
        uses: actions/checkout@v3
        with:
          submodules: recursive
      - name: Install dependencies
        run: |
          python ts_scripts/install_dependencies.py --environment=dev
      - name: Torchserve Sanity
        uses: nick-fields/retry@v3
        env:
          TS_MAC_ARM64_CPU_ONLY: 'True'
        with:
          timeout_minutes: 60
          max_attempts: 3
          retry_on: error
          command: |
            python torchserve_sanity.py
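To reproduce this sanity step outside CI, a minimal sketch (assumptions: an arm64 host, a local TorchServe checkout, and the same `TS_MAC_ARM64_CPU_ONLY` flag the workflow exports) could look like:

```python
# Hypothetical local mirror of the workflow's install + sanity steps on an arm64 host.
import os
import platform
import subprocess

# The workflow targets arm64 (Graviton) runners; stop early elsewhere.
assert platform.machine().lower() in ("arm64", "aarch64"), "expected an arm64 host"

# Same CPU-only flag the workflow sets for Arm runs.
env = {**os.environ, "TS_MAC_ARM64_CPU_ONLY": "True"}

# Same commands the workflow runs, from the repository root.
subprocess.run(["python", "ts_scripts/install_dependencies.py", "--environment=dev"], env=env, check=True)
subprocess.run(["python", "torchserve_sanity.py"], env=env, check=True)
```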
6 changes: 5 additions & 1 deletion .github/workflows/docker-ci.yaml
@@ -39,7 +39,11 @@ jobs:
working-directory: docker
run: |
IMAGE_TAG=test-image-${{ matrix.python-version }}
./build_image.sh -py "${{ matrix.python-version }}" -t "${IMAGE_TAG}" -b ${{ steps.branch-name.outputs.GITHUB_BRANCH }} -repo ${{ github.repositoryUrl }} -s
REPO_URL="${{ github.event.pull_request.head.repo.clone_url }}"
if [[ -z "${REPO_URL}" ]]; then
REPO_URL="https://github.com/pytorch/serve.git"
fi
./build_image.sh -py "${{ matrix.python-version }}" -t "${IMAGE_TAG}" -b "${{ steps.branch-name.outputs.GITHUB_BRANCH }}" -repo ${REPO_URL} -s
echo "IMAGE_TAG=${IMAGE_TAG}" >> $GITHUB_OUTPUT
- name: Container Healthcheck
41 changes: 41 additions & 0 deletions .github/workflows/regression_tests_graviton_cpu.yml
@@ -0,0 +1,41 @@
name: Run Regression Tests on CPU for Graviton

on:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master
  merge_group:

concurrency:
  group: ci-cpu-${{ github.workflow }}-${{ github.ref == 'refs/heads/master' && github.run_number || github.ref }}
  cancel-in-progress: true

jobs:
  regression-cpu:
    runs-on: [self-hosted, graviton-test]
    steps:
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
          architecture: arm64
      - name: Setup Java 17
        uses: actions/setup-java@v3
        with:
          distribution: 'zulu'
          java-version: '17'
      - name: Checkout TorchServe
        uses: actions/checkout@v3
        with:
          submodules: recursive
      - name: Install dependencies
        run: |
          python ts_scripts/install_dependencies.py --environment=dev
      - name: Torchserve Regression Tests
        env:
          TS_MAC_ARM64_CPU_ONLY: 'True'
        run: |
          python test/regression_tests.py
13 changes: 12 additions & 1 deletion README.md
@@ -62,13 +62,24 @@ Refer to [torchserve docker](docker/README.md) for details.

### 🤖 Quick Start LLM Deployment

```bash
# Make sure to install torchserve with pip or conda as described above and login with `huggingface-cli login`
python -m ts.llm_launcher --model_id meta-llama/Meta-Llama-3-8B-Instruct --disable_token_auth

# Try it out
curl -X POST -d '{"model":"meta-llama/Meta-Llama-3-8B-Instruct", "prompt":"Hello, my name is", "max_tokens": 200}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model/1.0/v1/completions"
```
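For reference, the same completions request can be sent from Python; a minimal `requests` sketch (assuming the local TorchServe instance started above is listening on port 8080):

```python
# Sketch of the curl call above using the requests library (assumed local server).
import requests

resp = requests.post(
    "http://localhost:8080/predictions/model/1.0/v1/completions",
    json={
        "model": "meta-llama/Meta-Llama-3-8B-Instruct",
        "prompt": "Hello, my name is",
        "max_tokens": 200,
    },
    timeout=120,
)
print(resp.text)
```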

### 🚢 Quick Start LLM Deployment with Docker

```bash
#export token=<HUGGINGFACE_HUB_TOKEN>
docker build --pull . -f docker/Dockerfile.llm -t ts/llm

docker run --rm -ti --shm-size 10g --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:8080 -v data:/data ts/llm --model_id meta-llama/Meta-Llama-3-8B-Instruct --disable_token_auth

curl -X POST -d '{"prompt":"Hello, my name is", "max_new_tokens": 50}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model"
# Try it out
curl -X POST -d '{"model":"meta-llama/Meta-Llama-3-8B-Instruct", "prompt":"Hello, my name is", "max_tokens": 200}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model/1.0/v1/completions"
```

Refer to [LLM deployment](docs/llm_deployment.md) for details and other methods.
1 change: 1 addition & 0 deletions docs/contents.rst
@@ -18,6 +18,7 @@
server
nvidia_mps
snapshot
intel_extension_for_pytorch <https://github.com/pytorch/serve/tree/master/examples/intel_extension_for_pytorch>
torchserve_on_win_native
torchserve_on_wsl
use_cases
1 change: 1 addition & 0 deletions docs/index.rst
@@ -19,6 +19,7 @@ What's going on in TorchServe?
* `Walmart Search: Serving Models at a Scale on TorchServe <https://medium.com/walmartglobaltech/search-model-serving-using-pytorch-and-torchserve-6caf9d1c5f4d>`__
* `Scaling inference on CPU with TorchServe <https://www.youtube.com/watch?v=066_Jd6cwZg>`__
* `TorchServe C++ backend <https://www.youtube.com/watch?v=OSmGGDpaesc>`__
* `TorchServe with Intel® Extension for PyTorch* <https://github.com/pytorch/serve/tree/master/examples/intel_extension_for_pytorch>`__
* `Grokking Intel CPU PyTorch performance from first principles: a TorchServe case study <https://pytorch.org/tutorials/intermediate/torchserve_with_ipex.html>`__
* `Grokking Intel CPU PyTorch performance from first principles( Part 2): a TorchServe case study <https://pytorch.org/tutorials/intermediate/torchserve_with_ipex_2.html>`__
* `Case Study: Amazon Ads Uses PyTorch and AWS Inferentia to Scale Models for Ads Processing <https://pytorch.org/blog/amazon-ads-case-study/>`__
@@ -154,7 +154,7 @@ def initialize(self, ctx):
self.initialized = True

def preprocess(self, requests):
"""Basic text preprocessing, based on the user's chocie of application mode.
"""Basic text preprocessing, based on the user's choice of application mode.
Args:
requests (str): The Input data in the form of text is passed on to the preprocess
function.
@@ -193,14 +193,14 @@ def preprocess(self, requests):

# preprocessing text for question_answering.
elif self.setup_config["mode"] == "question_answering":
# TODO Reading the context from a pickeled file or other fromats that
# TODO Reading the context from a pickled file or other formats that
# fits the requirements of the task in hand. If this is done then need to
# modify the following preprocessing accordingly.

# the sample text for question_answering in the current version
# should be formated as dictionary with question and text as keys
# should be formatted as dictionary with question and text as keys
# and related text as values.
# we use this format here seperate question and text for encoding.
# we use this format here separate question and text for encoding.

question_context = ast.literal_eval(input_text)
question = question_context["question"]
@@ -215,7 +215,7 @@
)
input_ids = inputs["input_ids"].to(self.device)
attention_mask = inputs["attention_mask"].to(self.device)
# making a batch out of the recieved requests
# making a batch out of the received requests
# attention masks are passed for cases where input tokens are padded.
if input_ids.shape is not None:
if input_ids_batch is None:
@@ -486,7 +486,7 @@ def captum_sequence_forward(inputs, attention_mask=None, position=0, model=None)


def summarize_attributions(attributions):
"""Summarises the attribution across multiple runs
"""Summarizes the attribution across multiple runs
Args:
attributions ([list): attributions from the Layer Integrated Gradients
Returns:
80 changes: 71 additions & 9 deletions examples/large_models/utils/test_llm_streaming_response.py
@@ -27,25 +27,67 @@ def _predict(self):
combined_text = ""
for chunk in response.iter_content(chunk_size=None):
if chunk:
data = json.loads(chunk)
text = self._extract_text(chunk)
if self.args.demo_streaming:
print(data["text"], end="", flush=True)
print(text, end="", flush=True)
else:
combined_text += data.get("text", "")
combined_text += text
if not self.args.demo_streaming:
self.queue.put_nowait(f"payload={payload}\n, output={combined_text}\n")

def _extract_completion(self, chunk):
chunk = chunk.decode("utf-8")
if chunk.startswith("data:"):
chunk = chunk[len("data:") :].split("\n")[0].strip()
if chunk.startswith("[DONE]"):
return ""
return json.loads(chunk)["choices"][0]["text"]

def _extract_chat(self, chunk):
chunk = chunk.decode("utf-8")
if chunk.startswith("data:"):
chunk = chunk[len("data:") :].split("\n")[0].strip()
if chunk.startswith("[DONE]"):
return ""
try:
return json.loads(chunk)["choices"][0].get("message", {})["content"]
except KeyError:
return json.loads(chunk)["choices"][0].get("delta", {}).get("content", "")

def _extract_text(self, chunk):
if self.args.openai_api:
if "chat" in self.args.api_endpoint:
return self._extract_chat(chunk)
else:
return self._extract_completion(chunk)
else:
return json.loads(chunk).get("text", "")

def _get_url(self):
return f"http://localhost:8080/predictions/{self.args.model}"
if self.args.openai_api:
return f"http://localhost:8080/predictions/{self.args.model}/{self.args.model_version}/{self.args.api_endpoint}"
else:
return f"http://localhost:8080/predictions/{self.args.model}"

def _format_payload(self):
prompt_input = _load_curl_like_data(self.args.prompt_text)
if "chat" in self.args.api_endpoint:
assert self.args.prompt_json, "Use prompt json file for chat interface"
assert self.args.openai_api, "Chat only works with the OpenAI API"
prompt_input = json.loads(prompt_input)
messages = prompt_input.get("messages", None)
assert messages is not None
rt = int(prompt_input.get("max_tokens", self.args.max_tokens))
prompt_input["max_tokens"] = rt
if self.args.demo_streaming:
prompt_input["stream"] = True
return prompt_input
if self.args.prompt_json:
prompt_input = json.loads(prompt_input)
prompt = prompt_input.get("prompt", None)
assert prompt is not None
prompt_list = prompt.split(" ")
rt = int(prompt_input.get("max_new_tokens", self.args.max_tokens))
rt = int(prompt_input.get("max_tokens", self.args.max_tokens))
else:
prompt_list = prompt_input.split(" ")
rt = self.args.max_tokens
@@ -58,13 +100,15 @@ def _format_payload(self):
cur_prompt = " ".join(prompt_list)
if self.args.prompt_json:
prompt_input["prompt"] = cur_prompt
prompt_input["max_new_tokens"] = rt
return prompt_input
prompt_input["max_tokens"] = rt
else:
return {
prompt_input = {
"prompt": cur_prompt,
"max_new_tokens": rt,
"max_tokens": rt,
}
if self.args.demo_streaming and self.args.openai_api:
prompt_input["stream"] = True
return prompt_input


def _load_curl_like_data(text):
@@ -136,6 +180,24 @@ def parse_args():
default=False,
help="Demo streaming response, force num-requests-per-thread=1 and num-threads=1",
)
parser.add_argument(
"--openai-api",
action=argparse.BooleanOptionalAction,
default=False,
help="Use OpenAI compatible API",
)
parser.add_argument(
"--api-endpoint",
type=str,
default="v1/completions",
help="OpenAI endpoint suffix",
)
parser.add_argument(
"--model-version",
type=str,
default="1.0",
help="Model vesion. Default: 1.0",
)

return parser.parse_args()

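The new `_extract_completion` and `_extract_chat` helpers strip the `data:` prefix from each streamed chunk, skip the `[DONE]` sentinel, and pull the text out of `choices[0]`; a self-contained sketch of that parsing, exercised on hypothetical example chunks, is:

```python
# Standalone sketch of the chunk parsing added above (assumed SSE-style payloads).
import json

def extract_text(chunk: bytes, chat: bool = False) -> str:
    """Return the text fragment carried by one streamed chunk, or '' for [DONE]."""
    data = chunk.decode("utf-8")
    if data.startswith("data:"):
        data = data[len("data:"):].split("\n")[0].strip()
    if data.startswith("[DONE]"):
        return ""
    choice = json.loads(data)["choices"][0]
    if not chat:
        return choice["text"]
    # Chat responses carry either a full message or a streaming delta.
    if "message" in choice:
        return choice["message"].get("content", "")
    return choice.get("delta", {}).get("content", "")

# Hypothetical example chunks in the two formats the helpers expect.
print(extract_text(b'data: {"choices":[{"text":" Alice"}]}'))
print(extract_text(b'data: {"choices":[{"delta":{"content":" Bob"}}]}', chat=True))
print(extract_text(b"data: [DONE]", chat=True))
```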
11 changes: 8 additions & 3 deletions examples/large_models/vllm/llama3/Readme.md
@@ -1,6 +1,6 @@
# Example showing inference with vLLM on LoRA model

This is an example showing how to integrate [vLLM](https://github.com/vllm-project/vllm) with TorchServe and run inference on model `meta-llama/Meta-Llama-3-8B-Instruct` with continuous batching.
This is an example showing how to integrate [vLLM](https://github.com/vllm-project/vllm) with TorchServe and run inference on model `meta-llama/Meta-Llama-3.1-8B-Instruct` with continuous batching.
This example supports distributed inference by following [this instruction](../Readme.md#distributed-inference)

### Step 0: Install vLLM
@@ -21,7 +21,7 @@ huggingface-cli login --token $HUGGINGFACE_TOKEN
```

```bash
python ../../utils/Download_model.py --model_path model --model_name meta-llama/Meta-Llama-3-8B-Instruct --use_auth_token True
python ../../utils/Download_model.py --model_path model --model_name meta-llama/Meta-Llama-3.1-8B-Instruct --use_auth_token True
```

### Step 2: Generate model artifacts
@@ -47,7 +47,12 @@ torchserve --start --ncs --ts-config ../config.properties --model-store model_st
```

### Step 5: Run inference
Run a text completion:
```bash
python ../../utils/test_llm_streaming_response.py -m llama3-8b -o 50 -t 2 -n 4 --prompt-text "@prompt.json" --prompt-json --openai-api
```

Or use the chat interface:
```bash
python ../../utils/test_llm_streaming_response.py -o 50 -t 2 -n 4 --prompt-text "@prompt.json" --prompt-json
python ../../utils/test_llm_streaming_response.py -m llama3-8b -o 50 -t 2 -n 4 --prompt-text "@chat.json" --prompt-json --openai-api --demo-streaming --api-endpoint "v1/chat/completions"
```
11 changes: 11 additions & 0 deletions examples/large_models/vllm/llama3/chat.json
@@ -0,0 +1,11 @@
{
"model": "llama3-8b",
"messages":[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Who won the world series in 2020?"},
{"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
{"role": "user", "content": "Where was it played?"}
],
"temperature":0.0,
"max_tokens": 50
}
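A payload like `chat.json` can be posted to the OpenAI-style chat route that `_get_url` builds (`/predictions/<model>/<version>/v1/chat/completions`); a hedged sketch, assuming the server from this example is running locally with the `llama3-8b` alias registered:

```python
# Sketch: send the chat.json payload to the OpenAI-compatible chat endpoint.
# Assumes a local TorchServe instance serving the llama3-8b model (version 1.0).
import json
import requests

with open("chat.json") as f:
    payload = json.load(f)

resp = requests.post(
    "http://localhost:8080/predictions/llama3-8b/1.0/v1/chat/completions",
    json=payload,
    timeout=120,
)
print(resp.text)
```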
5 changes: 4 additions & 1 deletion examples/large_models/vllm/llama3/model-config.yaml
@@ -7,7 +7,10 @@ deviceType: "gpu"
asyncCommunication: true

handler:
model_path: "model/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa/"
model_path: "model/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/8c22764a7e3675c50d4c7c9a4edb474456022b16"
vllm_engine_config:
max_num_seqs: 16
max_model_len: 250
served_model_name:
- "meta-llama/Meta-Llama-3.1-8B"
- "llama3-8b"
4 changes: 1 addition & 3 deletions examples/large_models/vllm/llama3/prompt.json
@@ -1,9 +1,7 @@
{
"prompt": "A robot may not injure a human being",
"max_new_tokens": 50,
"temperature": 0.8,
"logprobs": 1,
"prompt_logprobs": 1,
"max_tokens": 128,
"adapter": "adapter_1"
"model": "llama3-8b"
}