Skip to content

Commit

Permalink
feat: GPU ORT (#371)
Browse files Browse the repository at this point in the history
  • Loading branch information
MistEO authored Oct 1, 2024
1 parent ef707e4 commit e00f5f8
Show file tree
Hide file tree
Showing 16 changed files with 210 additions and 13 deletions.
6 changes: 6 additions & 0 deletions include/MaaFramework/MaaDef.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,12 @@ typedef MaaOption MaaResOption;
enum MaaResOptionEnum
{
    MaaResOption_Invalid = 0,

    /// Select the inference device used when running models.
    /// Defaults to INT32_MAX, which means run on the CPU.
    /// Set this option before loading the model; it does not affect
    /// sessions that have already been created.
    ///
    /// value: int32_t, eg: 0; val_size: sizeof(int32_t)
    MaaResOption_GpuId = 1,
};

typedef MaaOption MaaCtrlOption;
Expand Down
16 changes: 16 additions & 0 deletions source/MaaFramework/Resource/OCRResMgr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,22 @@ OCRResMgr::OCRResMgr()
option_.UseOrtBackend();
}

// Configure the OCR runtime option to run inference on the CPU backend.
// Always reports success.
bool OCRResMgr::use_cpu()
{
    LogInfo;

    option_.UseCpu();

    return true;
}

// Configure the OCR runtime option to run inference on a GPU.
// @param device_id index of the GPU device to use.
// Always reports success.
bool OCRResMgr::use_gpu(int device_id)
{
    LogInfo << VAR(device_id);

    option_.UseGpu(device_id);

    return true;
}

bool OCRResMgr::lazy_load(const std::filesystem::path& path, bool is_base)
{
LogFunc << VAR(path) << VAR(is_base);
Expand Down
3 changes: 3 additions & 0 deletions source/MaaFramework/Resource/OCRResMgr.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ class OCRResMgr : public NonCopyable
{
public:
OCRResMgr();

bool use_cpu();
bool use_gpu(int device_id);
bool lazy_load(const std::filesystem::path& path, bool is_base);
void clear();

Expand Down
99 changes: 98 additions & 1 deletion source/MaaFramework/Resource/ONNXResMgr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,109 @@

#include <filesystem>
#include <ranges>
#include <unordered_set>

#ifdef _WIN32
#include "Utils/SafeWindows.hpp"
#endif

#if __has_include(<onnxruntime/dml_provider_factory.h>)
#define MAA_WITH_DML
#include <onnxruntime/dml_provider_factory.h>
#endif

#if __has_include(<onnxruntime/coreml_provider_factory.h>)
#define MAA_WITH_COREML
#include <onnxruntime/coreml_provider_factory.h>
#endif

#include "Utils/Logger.h"
#include "Utils/Platform.h"

MAA_RES_NS_BEGIN

// Destructor.
//
// When a GPU execution provider was enabled, the ORT sessions and the
// session options are deliberately moved into heap objects that are never
// deleted, so their destructors never run.
ONNXResMgr::~ONNXResMgr()
{
    if (gpu_device_id_) {
        LogWarn << "GPU is enabled, leaking resources";

        // FIXME: intentionally leak ort objects to avoid crash (double free?)
        // https://github.com/microsoft/onnxruntime/issues/15174
        for (auto& session : classifiers_ | std::views::values) {
            // Move the session out of the shared_ptr into a leaked heap object;
            // the shared_ptr then destroys only an empty (nullptr) session.
            auto leak_session = new Ort::Session(nullptr);
            *leak_session = std::move(*session);
        }
        for (auto& session : detectors_ | std::views::values) {
            auto leak_session = new Ort::Session(nullptr);
            *leak_session = std::move(*session);
        }

        // Leak the session options for the same reason.
        auto leak_options = new Ort::SessionOptions(nullptr);
        *leak_options = std::move(options_);
    }
}

// Reset the session options to their defaults so inference runs on the CPU,
// and clear any previously selected GPU device. Always reports success.
bool ONNXResMgr::use_cpu()
{
    LogInfo;

    gpu_device_id_.reset();
    options_ = {};

    return true;
}

// Configure ONNX Runtime session options to run on the given GPU device.
//
// Tries the available execution providers in priority order: CUDA, then
// DirectML (Windows), then CoreML (macOS). On success, remembers the
// device id so repeated calls with the same id are no-ops.
//
// @param device_id index of the device to use (CoreML ignores it and uses
//                  default provider flags).
// @return false if no supported provider is available or appending one fails.
bool ONNXResMgr::use_gpu(int device_id)
{
    LogInfo << VAR(device_id);

    if (gpu_device_id_ && *gpu_device_id_ == device_id) {
        LogWarn << "GPU is already enabled";
        return true;
    }
    // Start from fresh options so a previously appended provider does not linger.
    options_ = {};

    auto all_providers_vec = Ort::GetAvailableProviders();
    std::unordered_set<std::string> all_providers(
        std::make_move_iterator(all_providers_vec.begin()),
        std::make_move_iterator(all_providers_vec.end()));
    LogInfo << VAR(all_providers);

    if (all_providers.contains("CUDAExecutionProvider")) {
        OrtCUDAProviderOptions cuda_options {};
        cuda_options.device_id = device_id;
        options_.AppendExecutionProvider_CUDA(cuda_options);

        LogInfo << "Using CUDA execution provider with device_id " << device_id;
    }
#ifdef MAA_WITH_DML
    else if (all_providers.contains("DmlExecutionProvider")) {
        auto status = OrtSessionOptionsAppendExecutionProvider_DML(options_, device_id);
        // Ort::Status takes ownership of the OrtStatus* and releases it.
        if (!Ort::Status(status).IsOK()) {
            LogError << "Failed to append DML execution provider with device_id " << device_id;
            return false;
        }
        LogInfo << "Using DML execution provider with device_id " << device_id;
    }
#endif
#ifdef MAA_WITH_COREML
    else if (all_providers.contains("CoreMLExecutionProvider")) {
        // The second argument is a coreml_flags bitmask, not a device id; 0 = defaults.
        // Rely on Ort::SessionOptions' implicit conversion to OrtSessionOptions*
        // (same as the DML call above) instead of a C-style cast.
        auto status = OrtSessionOptionsAppendExecutionProvider_CoreML(options_, 0);
        if (!Ort::Status(status).IsOK()) {
            LogError << "Failed to append CoreML execution provider";
            return false;
        }
        LogInfo << "Using CoreML execution provider";
    }
#endif
    else {
        LogError << "No supported execution provider found";
        return false;
    }

    gpu_device_id_ = device_id;
    return true;
}

bool ONNXResMgr::lazy_load(const std::filesystem::path& path, bool is_base)
{
LogFunc << VAR(path) << VAR(is_base);
Expand Down Expand Up @@ -71,7 +168,7 @@ std::shared_ptr<Ort::Session> ONNXResMgr::load(const std::string& name, const st
}

LogTrace << VAR(path);
Ort::Session session(m_env, path.c_str(), m_options);
Ort::Session session(env_, path.c_str(), options_);
return std::make_shared<Ort::Session>(std::move(session));
}

Expand Down
11 changes: 9 additions & 2 deletions source/MaaFramework/Resource/ONNXResMgr.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include <filesystem>
#include <memory>
#include <optional>

#include <onnxruntime/onnxruntime_cxx_api.h>

Expand All @@ -17,7 +18,12 @@ class ONNXResMgr : public NonCopyable
inline static const std::filesystem::path kClassifierDir = "classify";
inline static const std::filesystem::path kDetectorDir = "detect";

~ONNXResMgr();

public:
bool use_cpu();
bool use_gpu(int device_id);

bool lazy_load(const std::filesystem::path& path, bool is_base);
void clear();

Expand All @@ -31,8 +37,9 @@ class ONNXResMgr : public NonCopyable
std::vector<std::filesystem::path> classifier_roots_;
std::vector<std::filesystem::path> detector_roots_;

Ort::Env m_env;
Ort::SessionOptions m_options;
Ort::Env env_;
Ort::SessionOptions options_;
std::optional<int> gpu_device_id_;

mutable std::unordered_map<std::string, std::shared_ptr<Ort::Session>> classifiers_;
mutable std::unordered_map<std::string, std::shared_ptr<Ort::Session>> detectors_;
Expand Down
37 changes: 33 additions & 4 deletions source/MaaFramework/Resource/ResourceMgr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,16 @@ ResourceMgr::~ResourceMgr()

// Dispatch a resource option write to its dedicated handler.
// Unknown keys are logged and rejected.
bool ResourceMgr::set_option(MaaResOption key, MaaOptionValue value, MaaOptionValueSize val_size)
{
    LogFunc << VAR(key) << VAR_VOIDP(value) << VAR(val_size);

    if (key == MaaResOption_GpuId) {
        return set_gpu_id(value, val_size);
    }

    LogError << "Unknown key" << VAR(key) << VAR(value);
    return false;
}

MaaResId ResourceMgr::post_path(const std::filesystem::path& path)
Expand Down Expand Up @@ -243,6 +248,30 @@ CustomActionSession ResourceMgr::custom_action(const std::string& name) const
return it->second;
}

// Apply the MaaResOption_GpuId option to both inference backends.
//
// @param value pointer to an int32_t device id; INT32_MAX selects the CPU.
// @param val_size must equal sizeof(int32_t).
// @return true if the option was parsed and applied.
bool ResourceMgr::set_gpu_id(MaaOptionValue value, MaaOptionValueSize val_size)
{
    LogFunc << VAR_VOIDP(value) << VAR(val_size);

    // Reject a null pointer as well as a mismatched size before dereferencing.
    if (!value || val_size != sizeof(int32_t)) {
        LogError << "invalid value or size" << VAR_VOIDP(value) << VAR(val_size);
        return false;
    }

    // Read as int32_t to match the documented option type exactly.
    int32_t gpu_id = *reinterpret_cast<const int32_t*>(value);
    LogInfo << VAR(gpu_id);

    if (gpu_id == INT32_MAX) {
        onnx_res_.use_cpu();
        ocr_res_.use_cpu();
    }
    else {
        onnx_res_.use_gpu(gpu_id);
        ocr_res_.use_gpu(gpu_id);
    }

    return true;
}

bool ResourceMgr::run_load(typename AsyncRunner<std::filesystem::path>::Id id, std::filesystem::path path)
{
LogFunc << VAR(id) << VAR(path);
Expand Down
2 changes: 2 additions & 0 deletions source/MaaFramework/Resource/ResourceMgr.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ class ResourceMgr : public MaaResource
CustomActionSession custom_action(const std::string& name) const;

private:
bool set_gpu_id(MaaOptionValue value, MaaOptionValueSize val_size);

bool run_load(typename AsyncRunner<std::filesystem::path>::Id id, std::filesystem::path path);
bool load(const std::filesystem::path& path);
bool check_stop();
Expand Down
2 changes: 0 additions & 2 deletions source/MaaFramework/Vision/NeuralNetworkClassifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,6 @@ NeuralNetworkClassifier::Result NeuralNetworkClassifier::classify() const
cv::Size input_image_size(static_cast<int>(input_shape[3]), static_cast<int>(input_shape[2]));
cv::resize(image, image, input_image_size, 0, 0, cv::INTER_AREA);
std::vector<float> input = image_to_tensor(image);

// TODO: GPU
auto memory_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);

Ort::Value input_tensor =
Expand Down
2 changes: 0 additions & 2 deletions source/MaaFramework/Vision/NeuralNetworkDetector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,6 @@ NeuralNetworkDetector::ResultsVec NeuralNetworkDetector::detect() const
cv::Size input_image_size(static_cast<int>(input_shape[3]), static_cast<int>(input_shape[2]));
cv::resize(image, image, input_image_size, 0, 0, cv::INTER_AREA);
std::vector<float> input = image_to_tensor(image);

// TODO: GPU
auto memory_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);

Ort::Value input_tensor =
Expand Down
2 changes: 1 addition & 1 deletion source/MaaFramework/Vision/OCRer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ OCRer::ResultsVec OCRer::predict_det_and_rec(const cv::Mat& image_roi) const
fastdeploy::vision::OCRResult ocr_result;
bool ret = ocrer_->Predict(image_roi, &ocr_result);
if (!ret) {
LogWarn << "inferencer return false" << VAR(ocrer_) << VAR(image_) << VAR(image_roi);
LogWarn << "predict return false" << VAR(ocrer_) << VAR(image_) << VAR(image_roi);
return {};
}

Expand Down
2 changes: 2 additions & 0 deletions source/MaaProjectInterface/Impl/Configurator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ std::optional<RuntimeParam> Configurator::generate_runtime() const
}
}

runtime.gpu = config_.gpu;

return runtime;
}

Expand Down
1 change: 1 addition & 0 deletions source/MaaProjectInterface/Impl/Runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ bool Runner::run(
}

auto resource_handle = MaaResourceCreate(notify, notify_trans_arg);
MaaResourceSetOption(resource_handle, MaaResOption_GpuId, const_cast<int32_t*>(&param.gpu), sizeof(int32_t));

MaaId cid = MaaControllerPostConnection(controller_handle);
MaaId rid = 0;
Expand Down
10 changes: 10 additions & 0 deletions source/binding/Python/maa/define.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ class MaaStatusEnum(IntEnum):
MaaOption = ctypes.c_int32
MaaGlobalOption = MaaOption
MaaCtrlOption = MaaOption
MaaResOption = MaaOption


class MaaGlobalOptionEnum:
Expand Down Expand Up @@ -104,6 +105,15 @@ class MaaCtrlOptionEnum:
Recording = 5


class MaaResOptionEnum:
    Invalid = 0

    # Select the inference device; the default is INT32_MAX, which means CPU.
    # Set this option before loading the model.
    # value: int32_t, eg: 0; val_size: sizeof(int32_t)
    GpuId = 1


MaaAdbScreencapMethod = ctypes.c_uint64


Expand Down
23 changes: 23 additions & 0 deletions source/binding/Python/maa/resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,21 @@ def loaded(self) -> bool:
def clear(self) -> bool:
    """Drop everything loaded into this resource handle."""
    result = Library.framework.MaaResourceClear(self._handle)
    return bool(result)

def set_gpu(self, device_id: int) -> bool:
    """Select the GPU device used for inference.

    Set this before loading the model. Returns True on success.
    """
    value = ctypes.c_int32(device_id)
    ok = Library.framework.MaaResourceSetOption(
        self._handle,
        MaaResOptionEnum.GpuId,
        ctypes.pointer(value),
        ctypes.sizeof(ctypes.c_int32),
    )
    return bool(ok)

def set_cpu(self) -> bool:
    """Run inference on the CPU.

    Passes INT32_MAX as the device id, which the framework treats as CPU.
    """
    INT32_MAX = 2**31 - 1  # sentinel meaning CPU
    return self.set_gpu(INT32_MAX)

def register_custom_recognition(
self, name: str, recognition: "CustomRecognition" # type: ignore
) -> bool:
Expand Down Expand Up @@ -189,6 +204,14 @@ def _set_api_properties():
MaaStringBufferHandle,
]

Library.framework.MaaResourceSetOption.restype = MaaBool
Library.framework.MaaResourceSetOption.argtypes = [
MaaResourceHandle,
MaaResOption,
MaaOptionValue,
MaaOptionValueSize,
]

Library.framework.MaaResourceRegisterCustomRecognition.restype = MaaBool
Library.framework.MaaResourceRegisterCustomRecognition.argtypes = [
MaaResourceHandle,
Expand Down
4 changes: 3 additions & 1 deletion source/include/ProjectInterface/Types.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,9 @@ struct Configuration
Win32Config win32;
std::string resource;
std::vector<Task> task;
int32_t gpu = INT32_MAX;

MEO_JSONIZATION(controller, MEO_OPT adb, MEO_OPT win32, resource, task);
MEO_JSONIZATION(controller, MEO_OPT adb, MEO_OPT win32, resource, task, MEO_OPT gpu);
};

struct RuntimeParam
Expand Down Expand Up @@ -184,6 +185,7 @@ struct RuntimeParam
std::vector<std::string> resource_path;

std::vector<Task> task;
int32_t gpu = INT32_MAX;
};

struct CustomRecognitionSession
Expand Down
3 changes: 3 additions & 0 deletions test/python/binding_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,10 @@ def run(

def api_test():
r1 = Resource()
r1.set_gpu(0)
r1.set_gpu(1)
r2 = Resource()
r2.set_cpu()
r2.post_path("C:/_maafw_testing_/aaabbbccc").wait()
t1 = Tasker()
t2 = Tasker()
Expand Down

0 comments on commit e00f5f8

Please sign in to comment.