
Torchserve support for Intel GPUs #3132

Merged 37 commits on Jun 25, 2024
Changes from 30 commits
292d6ad
Merge changes from intel-sandbox/serve
krish-navulla May 1, 2024
0f81bdf
Merge branch 'pytorch:master' into xpu-enabled
krish-navulla May 2, 2024
88bea0d
ipex_gpu_enable - New config in config.properties
krish-navulla May 2, 2024
ee159f1
Instructions for IPEX GPU support
krish-navulla May 3, 2024
bbbb626
Final Commits 1
krish-navulla May 3, 2024
d7f0c8f
Style: Refactor code formatting
krish-navulla May 3, 2024
881572b
Readme Updated
krish-navulla May 3, 2024
e5f3e6a
Code Refactoring
krish-navulla May 3, 2024
e91db65
Code Refactoring
May 3, 2024
57a1ff6
Merge branch 'pytorch:master' into xpu-enabled
krish-navulla May 3, 2024
f308eef
Merge branch 'master' into xpu-enabled
krish-navulla May 3, 2024
78bd30c
Merge branch 'pytorch:master' into xpu-enabled
krish-navulla May 3, 2024
45f971f
Final Commit
krish-navulla May 3, 2024
a603fe1
Merge branch 'master' into xpu-enabled
krish-navulla May 4, 2024
c5179a2
Merge branch 'master' into xpu-enabled
krish-navulla May 6, 2024
f8e539e
Merge branch 'pytorch:master' into xpu-enabled
krish-navulla May 10, 2024
c5b2dbf
Merge branch 'pytorch:master' into xpu-enabled
krish-navulla May 10, 2024
ae03184
self.device mapping to XPU
krish-navulla May 14, 2024
52289dd
Merge branch 'pytorch:master' into xpu-enabled
krish-navulla May 14, 2024
d64f314
Code Refactoring
krish-navulla May 14, 2024
a4564da
Mulitple GPU device engagement enabled
krish-navulla May 15, 2024
27d705c
Merge branch 'master' into xpu-enabled
krish-navulla May 21, 2024
9188deb
Remove unused changes
Kanya-Mo May 29, 2024
e3be79c
Revert "Remove unused changes"
Kanya-Mo Jun 6, 2024
5fe7645
Add performance gain info for GPU
anupren Jun 14, 2024
e029268
Update README.md
anupren Jun 14, 2024
a8bdb47
Add units to table
anupren Jun 14, 2024
ee5187d
Update metric reading configuration.
Kanya-Mo Jun 19, 2024
578dfab
Update system metrics script path.
Kanya-Mo Jun 20, 2024
9f77a01
Merge branch 'master' into xpu-enabled
Kanya-Mo Jun 21, 2024
e48f2c1
Update ConfigManager.java
Kanya-Mo Jun 21, 2024
bca4012
Reformat ConfigManager.java
Kanya-Mo Jun 22, 2024
77f4495
Merge branch 'master' into xpu-enabled
Kanya-Mo Jun 22, 2024
7c4a69d
Merge branch 'master' into xpu-enabled
agunapal Jun 22, 2024
86f7ced
Fix spelling issues
Kanya-Mo Jun 24, 2024
927483d
Fix lint changed file.
Kanya-Mo Jun 24, 2024
fd20a57
Merge branch 'master' into xpu-enabled
Kanya-Mo Jun 24, 2024
90 changes: 90 additions & 0 deletions examples/intel_extension_for_pytorch/README.md
@@ -9,6 +9,8 @@ Here we show how to use TorchServe with Intel® Extension for PyTorch*.
* [Install Intel® Extension for PyTorch*](https://github.com/pytorch/serve/blob/master/examples/intel_extension_for_pytorch/README.md#install-intel-extension-for-pytorch)
* [Serving model with Intel® Extension for PyTorch*](https://github.com/pytorch/serve/blob/master/examples/intel_extension_for_pytorch/README.md#serving-model-with-intel-extension-for-pytorch)
* [TorchServe with Launcher](#torchserve-with-launcher)
* [TorchServe with Intel® Extension for PyTorch* and Intel GPUs](#torchserve-with-intel-extension-for-pytorch-and-intel-gpus)
* [Performance Gain with Intel® Extension for PyTorch* and Intel GPU](https://github.com/pytorch/serve/blob/master/examples/intel_extension_for_pytorch/README.md#performance-gain-with-intel-extension-for-pytorch-and-intel-gpu)
* [Creating and Exporting INT8 model for Intel® Extension for PyTorch*](https://github.com/pytorch/serve/blob/master/examples/intel_extension_for_pytorch/README.md#creating-and-exporting-int8-model-for-intel-extension-for-pytorch)
* [Benchmarking with Launcher](#benchmarking-with-launcher)
* [Performance Boost with Intel® Extension for PyTorch* and Launcher](https://github.com/pytorch/serve/blob/master/examples/intel_extension_for_pytorch/README.md#performance-boost-with-intel-extension-for-pytorch-and-launcher)
@@ -73,6 +75,7 @@ CPU usage is shown below. 4 main worker threads were launched, each launching 14
![26](https://user-images.githubusercontent.com/93151422/170373651-fd8a0363-febf-4528-bbae-e1ddef119358.gif)



#### Scaling workers
Additionally when dynamically [scaling the number of workers](https://pytorch.org/serve/management_api.html#scale-workers), cores that were pinned to killed workers by the launcher could be left unutilized. To address this problem, launcher internally restarts the workers to re-distribute cores that were pinned to killed workers to the remaining, alive workers. This is taken care internally, so users do not have to worry about this.

@@ -90,6 +93,93 @@ Add the following lines in `config.properties` to use launcher with its default
cpu_launcher_enable=true
```

## TorchServe with Intel® Extension for PyTorch* and Intel GPUs

TorchServe can also leverage Intel GPUs for acceleration, providing additional performance benefits. To use TorchServe with an Intel GPU, the machine must have the latest Intel oneAPI Base Kit installed and activated, and the Intel® Extension for PyTorch* (IPEX) GPU package installed.


### Installation and Setup for Intel GPU Support
**Install Intel oneAPI Base Kit:**
Follow the installation instructions for your operating system from the [Intel oneAPI Base Kit Installation](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.htm) page.

**Install the IPEX GPU package to enable TorchServe to utilize Intel GPUs for acceleration:**
Follow the installation instructions for your operating system from the [Intel® Extension for PyTorch* XPU/GPU Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu).

**Activate the Intel oneAPI Base Kit:**
Activate the Intel oneAPI Base Kit using the following command:
```bash
source /path/to/oneapi/setvars.sh
```

**Install xpu-smi:**
Install xpu-smi so that TorchServe can detect the number of Intel GPU devices present. xpu-smi also reports information about Intel GPUs, including temperature, utilization, and other metrics. See the [xpu-smi Installation Guide](https://dgpu-docs.intel.com/driver/installation.html#ubuntu-package-repository).

**Enable Intel GPU Support in TorchServe:**
To enable TorchServe to use Intel GPUs, set the following configuration in `config.properties`:
```
ipex_enable=true
ipex_gpu_enable=true
```
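TorchServe forwards `ipex_gpu_enable` to backend workers as the `TS_IPEX_GPU_ENABLE` environment variable (see `getBackendConfiguration` in `ConfigManager.java`). A minimal sketch of the resulting device-selection order in the base handler is shown below; the helper name is illustrative, and the hardware-availability flags are injected as parameters so the sketch runs without GPU hardware:

```python
import os


def select_device(gpu_id, cuda_available, xpu_available):
    """Pick the torch device string a worker would use.

    Mirrors the selection order in base_handler.py: CUDA first, then XPU
    (only when TS_IPEX_GPU_ENABLE is set), otherwise CPU.
    """
    ipex_gpu = os.environ.get("TS_IPEX_GPU_ENABLE", "false") == "true"
    if cuda_available and gpu_id is not None:
        return f"cuda:{gpu_id}"
    if xpu_available and gpu_id is not None and ipex_gpu:
        return f"xpu:{gpu_id}"
    return "cpu"


os.environ["TS_IPEX_GPU_ENABLE"] = "true"
print(select_device(0, cuda_available=False, xpu_available=True))  # xpu:0
```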
To enable metric reading, additionally set:
```
system_metrics_cmd=${PATH to examples/intel_extension_for_pytorch/intel_gpu_metric_collector.py} --gpu ${Number of GPUs}
```
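The metric collector relies on `xpu-smi` output. As an illustration only (the actual `intel_gpu_metric_collector.py` ships with this PR, and the exact `xpu-smi dump` column set depends on the metrics requested), here is a sketch of parsing a CSV-style dump into per-device utilization; the sample text is hypothetical:

```python
import csv
import io

# Hypothetical sample in the style of `xpu-smi dump` CSV output;
# the real columns depend on the requested metric IDs.
SAMPLE = """Timestamp, DeviceId, GPU Utilization (%), GPU Memory Utilization (%)
06:14:46.000, 0, 83.50, 42.10
06:14:46.000, 1, 12.00, 7.25
"""


def parse_utilization(text):
    """Return {device_id: gpu_utilization} from CSV-style dump output."""
    reader = csv.reader(io.StringIO(text))
    header = [c.strip() for c in next(reader)]
    dev_col = header.index("DeviceId")
    util_col = header.index("GPU Utilization (%)")
    return {int(row[dev_col]): float(row[util_col]) for row in reader if row}


print(parse_utilization(SAMPLE))  # {0: 83.5, 1: 12.0}
```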

## Performance Gain with Intel® Extension for PyTorch* and Intel GPU

To measure the performance gain from using an Intel GPU, the TorchServe-recommended [Apache Bench benchmark](https://github.com/pytorch/serve/tree/master/benchmarks#benchmarking-with-apache-bench) is executed on the FastRCNN FP32 model.

A `model_config.json` file is created, and the following parameters are added:

```
{
"url": "https://torchserve.pytorch.org/mar_files/fastrcnn.mar",
"requests": "10000",
"concurrency": "100",
"workers": "1",
"batch_delay": "100",
"batch_size": "1",
"input": "../examples/image_classifier/kitten.jpg",
"backend_profiling": "FALSE",
"exec_env": "local"
}
```

The batch size can be changed according to your requirements.

The following lines are added to `config.properties` to utilize IPEX and the Intel GPU:

```
ipex_enable=true
ipex_gpu_enable=true
```

To reproduce the test, use the following command:

```
python benchmark-ab.py --config model_config.json --config_properties config.properties
```

This test is performed on a server with an Intel(R) Core(TM) i5-9600K CPU and Intel(R) Arc(TM) A770 Graphics, and the results are compared with an Intel(R) Xeon(R) Gold 6438Y CPU server.
It is recommended to use only one worker per GPU; running more than one worker per GPU is not validated and may degrade performance due to context switching.
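Putting the pieces together, a hedged `config.properties` for this single-GPU benchmark could look like the following (`default_workers_per_model` is a standard TorchServe property; the value of 1 follows the one-worker-per-GPU recommendation above):

```
ipex_enable=true
ipex_gpu_enable=true
default_workers_per_model=1
```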


| Model | Batch size | GPU Throughput(img/sec) | CPU Throughput(img/sec) | GPU TS Latency mean(ms) | CPU TS Latency mean(ms) | Throughput speedup ratio | Latency speedup ratio |
|:-----:|:----------:|:--------------:|:--------------:|:-------------------:|:-------------------:|:-------------------------:|:----------------------:|
| FastRCNN_FP32 | 1 | 15.74 | 2.89 | 6352.388 | 34636.68 | 5.45 | 5.45 |
| | 2 | 17.69 | 2.67 | 5651.999 | 37520.781 | 6.63 | 6.64 |
| | 4 | 18.57 | 2.39 | 5385.389 | 41886.902 | 7.77 | 7.78 |
| | 8 | 18.68 | 2.32 | 5354.58 | 43146.797 | 8.05 | 8.06 |
| | 16 | 19.26 | 2.39 | 5193.307 | 41903.752 | 8.06 | 8.07 |
| | 32 | 19.06 | 2.49 | 5245.912 | 40172.39 | 7.65 | 7.66 |
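The two speedup columns are straightforward ratios of the measured values; a quick check for the batch-size-1 row:

```python
# Batch size 1 values taken from the table above.
throughput_a, throughput_b = 15.74, 2.89    # img/sec
latency_a, latency_b = 6352.388, 34636.68   # ms (TS latency mean)

throughput_speedup = throughput_a / throughput_b
latency_speedup = latency_b / latency_a     # lower latency is better

print(round(throughput_speedup, 2))  # 5.45
print(round(latency_speedup, 2))     # 5.45
```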

<p align="center">
<img src="https://github.com/pytorch/serve/assets/113945574/c30aeacc-9825-42b1-bde8-2d9dca09bb8a" />
</p>
The graph above shows the throughput and latency speedup ratios when using the Intel GPU. The speedup ratio increases steadily, reaching almost 8x at batch size 8, and yields diminishing returns beyond that. Increasing the batch size further to 64 results in a `RuntimeError: Native API failed. Native API returns: -5 (PI_ERROR_OUT_OF_RESOURCES)` error because the GPU is overloaded.

Note: The optimal configuration will vary depending on the hardware used.

## Creating and Exporting INT8 model for Intel® Extension for PyTorch*
Intel® Extension for PyTorch* supports both eager and torchscript mode. In this section, we show how to deploy INT8 model for Intel® Extension for PyTorch*. Refer to [here](https://github.com/intel/intel-extension-for-pytorch/blob/master/docs/tutorials/features/int8_overview.md) for more details on Intel® Extension for PyTorch* optimizations for quantization.

@@ -79,6 +79,7 @@ public final class ConfigManager {
private static final String TS_IPEX_ENABLE = "ipex_enable";
private static final String TS_CPU_LAUNCHER_ENABLE = "cpu_launcher_enable";
private static final String TS_CPU_LAUNCHER_ARGS = "cpu_launcher_args";
private static final String TS_IPEX_GPU_ENABLE = "ipex_gpu_enable";

private static final String TS_ASYNC_LOGGING = "async_logging";
private static final String TS_CORS_ALLOWED_ORIGIN = "cors_allowed_origin";
@@ -161,7 +162,7 @@ public final class ConfigManager {
private static Pattern pattern = Pattern.compile("\\$\\$([^$]+[^$])\\$\\$");

private Pattern blacklistPattern;
private Properties prop;

private boolean snapshotDisabled;

@@ -272,6 +273,7 @@ private ConfigManager(Arguments args) throws IOException {
getAvailableGpu(),
getIntProperty(TS_NUMBER_OF_GPU, Integer.MAX_VALUE))));


String pythonExecutable = args.getPythonExecutable();
if (pythonExecutable != null) {
prop.setProperty(PYTHON_EXECUTABLE, pythonExecutable);
@@ -473,6 +475,10 @@ public String getCPULauncherArgs() {
return getProperty(TS_CPU_LAUNCHER_ARGS, null);
}

public boolean isIPEXGpuEnabled() {
return Boolean.parseBoolean(getProperty(TS_IPEX_GPU_ENABLE, "false"));
}

public boolean getDisableTokenAuthorization() {
return Boolean.parseBoolean(getProperty(TS_DISABLE_TOKEN_AUTHORIZATION, "false"));
}
@@ -490,8 +496,9 @@ public int getJobQueueSize() {
}

public int getNumberOfGpu() {
return getIntProperty(TS_NUMBER_OF_GPU, 0);
}

public boolean getModelControlMode() {
return Boolean.parseBoolean(getProperty(MODEL_CONTROL_MODE, "false"));
@@ -647,7 +654,7 @@ public String getCertificateFile() {
public String getSystemMetricsCmd() {
return prop.getProperty(SYSTEM_METRICS_CMD, "");
}

public SslContext getSslContext() throws IOException, GeneralSecurityException {
List<String> supportedCiphers =
Arrays.asList(
@@ -902,6 +909,7 @@ public HashMap<String, String> getBackendConfiguration() {
// Append properties used by backend worker here
config.put("TS_DECODE_INPUT_REQUEST", prop.getProperty(TS_DECODE_INPUT_REQUEST, "true"));
config.put("TS_IPEX_ENABLE", prop.getProperty(TS_IPEX_ENABLE, "false"));
config.put("TS_IPEX_GPU_ENABLE", prop.getProperty(TS_IPEX_GPU_ENABLE, "false"));
return config;
}

@@ -922,6 +930,7 @@ private static String getCanonicalPath(String path) {

private static int getAvailableGpu() {
try {
List<Integer> gpuIds = new ArrayList<>();
String visibleCuda = System.getenv("CUDA_VISIBLE_DEVICES");
if (visibleCuda != null && !visibleCuda.isEmpty()) {
@@ -953,7 +962,10 @@ private static int getAvailableGpu() {
// No MPS devices detected
return 0;
} else {
try {
Process process =
Runtime.getRuntime().exec("nvidia-smi --query-gpu=index --format=csv");
int ret = process.waitFor();
if (ret != 0) {
@@ -967,13 +979,35 @@ private static int getAvailableGpu() {
for (int i = 1; i < list.size(); i++) {
gpuIds.add(Integer.parseInt(list.get(i)));
}
}
} catch (IOException | InterruptedException e) {
System.out.println("nvidia-smi not available or failed: " + e.getMessage());
}
try {
Process process = Runtime.getRuntime().exec("xpu-smi discovery --dump 1");
int ret = process.waitFor();
if (ret != 0) {
return 0;
}
List<String> list =
IOUtils.readLines(process.getInputStream(), StandardCharsets.UTF_8);
if (list.isEmpty() || !list.get(0).contains("Device ID")) {
throw new AssertionError("Unexpected xpu-smi response.");
}
for (int i = 1; i < list.size(); i++) {
gpuIds.add(Integer.parseInt(list.get(i)));
}
} catch (IOException | InterruptedException e) {
System.out.println("xpu-smi not available or failed: " + e.getMessage());
}
}
return gpuIds.size();
} catch (IOException | InterruptedException e) {
return 0;
}
}

public List<String> getAllowedUrls() {
String allowedURL = prop.getProperty(TS_ALLOWED_URLS, DEFAULT_TS_ALLOWED_URLS);
18 changes: 12 additions & 6 deletions ts/torch_handler/base_handler.py
@@ -20,6 +20,7 @@
load_label_mapping,
)


if packaging.version.parse(torch.__version__) >= packaging.version.parse("1.8.1"):
from torch.profiler import ProfilerActivity, profile, record_function

@@ -69,7 +70,6 @@
if os.environ.get("TS_IPEX_ENABLE", "false") == "true":
try:
import intel_extension_for_pytorch as ipex

IPEX_AVAILABLE = True
except ImportError as error:
logger.warning(
@@ -79,7 +79,7 @@
else:
IPEX_AVAILABLE = False


try:
import onnxruntime as ort
import psutil
@@ -147,17 +147,22 @@ def initialize(self, context):
RuntimeError: Raises the Runtime error when the model.py is missing

"""

if context is not None and hasattr(context, "model_yaml_config"):
self.model_yaml_config = context.model_yaml_config

properties = context.system_properties

if torch.cuda.is_available() and properties.get("gpu_id") is not None:
self.map_location = "cuda"
self.device = torch.device(
self.map_location + ":" + str(properties.get("gpu_id"))
)
elif torch.xpu.is_available() and properties.get("gpu_id") is not None and os.environ.get("TS_IPEX_GPU_ENABLE", "false") == "true":
self.map_location = "xpu"
self.device = torch.device(
self.map_location + ":" + str(properties.get("gpu_id"))
)
torch.xpu.set_device(self.device)  # make this worker's XPU the active device
elif torch.backends.mps.is_available() and properties.get("gpu_id") is not None:
self.map_location = "mps"
self.device = torch.device("mps")
@@ -273,6 +278,7 @@ def initialize(self, context):

elif IPEX_AVAILABLE:
self.model = self.model.to(memory_format=torch.channels_last)
self.model = self.model.to(self.device)
self.model = ipex.optimize(self.model)
logger.info(f"Compiled model with ipex")

@@ -560,4 +566,4 @@ def get_device(self):
Returns:
string : self device
"""
return self.device