[Auto Parallel] Make the new comm library upgrade compatible with XPUs.
ZibinGuo committed Apr 24, 2024
1 parent 0359685 commit 6e9ad97
Showing 9 changed files with 186 additions and 7 deletions.
20 changes: 20 additions & 0 deletions paddle/fluid/memory/allocation/allocator_facade.cc
@@ -1813,6 +1813,26 @@ void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(int64_t id) {
}
}
#endif
#elif defined(PADDLE_WITH_XPU)
const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
const platform::Place& place, XPUStream stream) {
AllocatorFacadePrivate* m = GetPrivate();

// XPU has no malloc-async allocator yet, so it shares the
// IsStreamSafeCUDAAllocatorUsed() logic.
if (!m->IsStreamSafeCUDAAllocatorUsed()) {
VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!";
return GetAllocator(place);
}

if (platform::is_xpu_place(place) && FLAGS_use_system_allocator == false) {
return m->GetAllocator(place,
stream,
/*create_if_not_found=*/true);
}
return m->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
}
#endif

#ifdef PADDLE_WITH_CUSTOM_DEVICE
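For illustration only (not part of this commit): a minimal sketch of how the new XPU-stream overload of GetAllocator above might be used, assuming an XPU build with the stream-safe allocator enabled. The helper name AllocateOnXpuStream, the device id 0, and the 1 KiB size are placeholders.

#include "paddle/fluid/memory/allocation/allocator_facade.h"

void AllocateOnXpuStream(XPUStream stream) {
  auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();
  // Returns the allocator bound to `stream` on XPU device 0, creating it on
  // first use; falls back to the default allocator when the stream-safe
  // allocator is disabled (see the VLOG warning above).
  const auto& allocator =
      facade.GetAllocator(paddle::platform::XPUPlace(0), stream);
  // Allocate 1 KiB through the stream-bound allocator.
  auto allocation = allocator->Allocate(1024);
}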
3 changes: 3 additions & 0 deletions paddle/fluid/memory/allocation/allocator_facade.h
@@ -93,6 +93,9 @@ class AllocatorFacade {
const platform::Place& place, gpuStream_t stream);
gpuStream_t GetStream(const std::shared_ptr<Allocation>& allocation) const;
void SetDefaultStream(const platform::CUDAPlace& place, gpuStream_t stream);
#elif defined(PADDLE_WITH_XPU)
TEST_API const std::shared_ptr<Allocator>& GetAllocator(
const platform::Place& place, XPUStream stream);
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
30 changes: 30 additions & 0 deletions paddle/fluid/platform/init.cc
@@ -61,6 +61,8 @@ limitations under the License. */
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
(defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL))
#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h"
#elif (defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL))
#include "paddle/fluid/platform/device/xpu/xpu_resource_pool.h"
#endif

COMMON_DECLARE_int32(paddle_num_threads);
@@ -476,6 +478,34 @@ void InitMemoryMethod() {
memory_method->get_new_cuda_event = [](int device_id) {
return paddle::platform::CudaEventResourcePool::Instance().New(device_id);
};
#elif (defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL))
// TODO(ZibinGuo): Use phi methods later.
memory_method->get_allocator =
[](int device_id, XPUStream stream) -> phi::Allocator * {
return paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::XPUPlace(device_id), stream)
.get();
};
memory_method->get_host_allocator = []() -> phi::Allocator * {
return paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get();
};
memory_method->get_zero_allocator = [](int device_id) -> phi::Allocator * {
return paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(phi::XPUPlace(device_id))
.get();
};
memory_method->get_host_zero_allocator = []() -> phi::Allocator * {
return paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(phi::CPUPlace())
.get();
};
// XPUs do not have the concept of pinned memory,
// so the get_pinned_allocator function is not set.
memory_method->get_new_xpu_event = [](int device_id) {
return paddle::platform::XpuEventResourcePool::Instance().New(device_id);
};
#endif

memory_method->emplace_device_contexts =
23 changes: 23 additions & 0 deletions paddle/phi/common/memory_utils.cc
@@ -116,6 +116,29 @@ std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type> GetCudaEvent(
int device_id) {
return MemoryUtils::Instance().GetCudaEvent(device_id);
}
#elif (defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL))
const phi::Allocator* GetAllocator(int device_id, XPUStream stream) {
return MemoryUtils::Instance().GetAllocator(device_id, stream);
}

const phi::Allocator* GetHostAllocator() {
return MemoryUtils::Instance().GetHostAllocator();
}

const phi::Allocator* GetZeroAllocator(int device_id) {
return MemoryUtils::Instance().GetZeroAllocator(device_id);
}

const phi::Allocator* GetHostZeroAllocator() {
return MemoryUtils::Instance().GetHostZeroAllocator();
}

// XPUs do not have the concept of pinned memory,
// so the get_pinned_allocator function is not set.
std::shared_ptr<std::remove_pointer<XPUEvent>::type> GetXpuEvent(
int device_id) {
return MemoryUtils::Instance().GetXpuEvent(device_id);
}
#endif

} // namespace memory_utils
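As a rough usage sketch (not part of this commit), the new XPU entry points above are consumed the same way the comm_context_manager.cc hunk later in this commit wires up an XPUContext. The helper name MakeXpuContext and the xpu_context.h include path are assumptions for illustration.

#include <memory>

#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"

std::unique_ptr<phi::XPUContext> MakeXpuContext(int device_id) {
  auto ctx = std::make_unique<phi::XPUContext>(phi::XPUPlace(device_id));
  // Stream-bound device allocator plus host and zero-size allocators.
  ctx->SetAllocator(phi::memory_utils::GetAllocator(device_id, ctx->stream()));
  ctx->SetHostAllocator(phi::memory_utils::GetHostAllocator());
  ctx->SetZeroAllocator(phi::memory_utils::GetZeroAllocator(device_id));
  ctx->SetHostZeroAllocator(phi::memory_utils::GetHostZeroAllocator());
  // No pinned allocator: XPUs have no pinned-memory concept.
  return ctx;
}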
41 changes: 41 additions & 0 deletions paddle/phi/common/memory_utils.h
@@ -170,6 +170,14 @@ struct MemoryInterface {
phi::Allocator* (*get_pinned_allocator)();
std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type> (
*get_new_cuda_event)(int device_id);
#elif (defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL))
phi::Allocator* (*get_allocator)(int device_id, XPUStream stream);
phi::Allocator* (*get_host_allocator)();
phi::Allocator* (*get_zero_allocator)(int device_id);
phi::Allocator* (*get_host_zero_allocator)();
// phi::Allocator* (*get_pinned_allocator)();
std::shared_ptr<std::remove_pointer<XPUEvent>::type> (*get_new_xpu_event)(
int device_id);
#endif
};

@@ -370,6 +378,27 @@ class MemoryUtils {
int device_id) {
return memory_method_->get_new_cuda_event(device_id);
}
#elif (defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL))
const phi::Allocator* GetAllocator(int device_id, XPUStream stream) {
return memory_method_->get_allocator(device_id, stream);
}

const phi::Allocator* GetHostAllocator() {
return memory_method_->get_host_allocator();
}

const phi::Allocator* GetZeroAllocator(int device_id) {
return memory_method_->get_zero_allocator(device_id);
}

const phi::Allocator* GetHostZeroAllocator() {
return memory_method_->get_host_zero_allocator();
}

std::shared_ptr<std::remove_pointer<XPUEvent>::type> GetXpuEvent(
int device_id) {
return memory_method_->get_new_xpu_event(device_id);
}
#endif

private:
@@ -448,6 +477,18 @@ const Allocator* GetPinnedAllocator();

std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type> GetCudaEvent(
int device_id);
#elif (defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL))
const Allocator* GetAllocator(int device_id, XPUStream stream);

const Allocator* GetHostAllocator();

const Allocator* GetZeroAllocator(int device_id);

const Allocator* GetHostZeroAllocator();

// XPUs do not have the concept of pinned memory,
// so the get_pinned_allocator function is not set.
std::shared_ptr<std::remove_pointer<XPUEvent>::type> GetXpuEvent(int device_id);
#endif

class Buffer {
14 changes: 14 additions & 0 deletions paddle/phi/core/distributed/bkcl_comm_context.cc
@@ -42,6 +42,20 @@ void BKCLCommContext::SetDevContext(
dev_ctx_ = std::move(dev_ctx);
}

XPUEvent BKCLCommContext::GetComputeEvent() { return compute_event_.get(); }

void BKCLCommContext::SetComputeEvent(
std::shared_ptr<std::remove_pointer<XPUEvent>::type>&& compute_event) {
compute_event_ = std::move(compute_event);
}

XPUEvent BKCLCommContext::GetCommEvent() { return comm_event_.get(); }

void BKCLCommContext::SetCommEvent(
std::shared_ptr<std::remove_pointer<XPUEvent>::type>&& comm_event) {
comm_event_ = std::move(comm_event);
}

void BKCLCommContext::Broadcast(phi::DenseTensor* out_tensor,
const phi::DenseTensor& in_tensor,
int root,
16 changes: 16 additions & 0 deletions paddle/phi/core/distributed/bkcl_comm_context.h
@@ -30,6 +30,16 @@ class BKCLCommContext final : public CommContext {

XPUStream GetStream();

XPUEvent GetComputeEvent();

void SetComputeEvent(
std::shared_ptr<std::remove_pointer<XPUEvent>::type>&& compute_event);

XPUEvent GetCommEvent();

void SetCommEvent(
std::shared_ptr<std::remove_pointer<XPUEvent>::type>&& comm_event);

phi::XPUContext* GetDevContext();

void SetDevContext(std::unique_ptr<phi::XPUContext>&& dev_ctx);
@@ -79,6 +89,12 @@ class BKCLCommContext final : public CommContext {
BKCLContext_t bkcl_comm_;

std::unique_ptr<phi::XPUContext> dev_ctx_;

// Used when the comm stream waits for compute: compute_stream --> event --> comm_stream
std::shared_ptr<std::remove_pointer<XPUEvent>::type> compute_event_;

// Used when the compute stream waits for comm: comm_stream --> event --> compute_stream
std::shared_ptr<std::remove_pointer<XPUEvent>::type> comm_event_;
};

} // namespace distributed
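The two event members above encode the synchronization pattern called out in the comments: the comm stream waits on compute through compute_event_, and compute waits on comm through comm_event_. Below is a hedged sketch of the first direction (not part of this commit); xpu_event_record, xpu_stream_wait_event, and the xpu/runtime.h include are assumed from the XPU runtime as used elsewhere in Paddle, and CommWaitCompute is a hypothetical helper.

#include "xpu/runtime.h"

#include "paddle/phi/core/distributed/bkcl_comm_context.h"

// Make the BKCL comm stream wait for work already queued on the compute
// stream: compute_stream --> compute_event_ --> comm_stream.
void CommWaitCompute(phi::distributed::BKCLCommContext* comm_ctx,
                     XPUStream compute_stream) {
  XPUEvent event = comm_ctx->GetComputeEvent();
  // Record the event at the current tail of the compute stream ...
  xpu_event_record(event, compute_stream);
  // ... and block the comm stream until that event completes.
  xpu_stream_wait_event(comm_ctx->GetStream(), event);
}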
34 changes: 30 additions & 4 deletions paddle/phi/core/distributed/comm_context_manager.cc
@@ -34,14 +34,14 @@
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/distributed/nccl_comm_context.h"
#include "paddle/phi/core/distributed/nccl_tools.h"
#elif defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/phi/backends/xpu/xpu_info.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/distributed/bkcl_comm_context.h"
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/phi/core/distributed/xccl_comm_context.h"
#endif
#ifdef PADDLE_WITH_XPU_BKCL
#include "paddle/phi/backends/xpu/xpu_info.h"
#include "paddle/phi/core/distributed/bkcl_comm_context.h"
#endif

namespace phi {
namespace distributed {
@@ -52,6 +52,9 @@ void CommContextManager::SetDeviceId(int dev_id) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
phi::backends::gpu::SetDeviceId(dev_id);
CommContextManager::device_id = dev_id;
#elif defined(PADDLE_WITH_XPU_BKCL)
phi::backends::xpu::SetXPUDeviceId(dev_id);
CommContextManager::device_id = dev_id;
#endif
}

@@ -207,6 +210,29 @@ void CommContextManager::CreateBKCLCommContext(
auto bkcl_comm_context =
std::make_unique<BKCLCommContext>(rank, size, bkcl_id);

if (CommContextManager::device_id != -1) {
std::unique_ptr<phi::XPUContext> dev_ctx(
new phi::XPUContext(phi::XPUPlace(CommContextManager::device_id)));
dev_ctx->SetAllocator(phi::memory_utils::GetAllocator(
CommContextManager::device_id, dev_ctx->stream()));
dev_ctx->SetHostAllocator(phi::memory_utils::GetHostAllocator());
dev_ctx->SetZeroAllocator(
phi::memory_utils::GetZeroAllocator(CommContextManager::device_id));
dev_ctx->SetHostZeroAllocator(phi::memory_utils::GetHostZeroAllocator());
// XPUs do not have the concept of pinned memory,
// so the get_pinned_allocator function is not set.

// dev_ctx->PartialInitWithAllocator() is not yet supported for XPU.
auto compute_event =
phi::memory_utils::GetXpuEvent(CommContextManager::device_id);
auto comm_event =
phi::memory_utils::GetXpuEvent(CommContextManager::device_id);

bkcl_comm_context->SetDevContext(std::move(dev_ctx));
bkcl_comm_context->SetComputeEvent(std::move(compute_event));
bkcl_comm_context->SetCommEvent(std::move(comm_event));
}

comm_context_manager.SetStore(store);
comm_context_manager.Emplace(unique_comm_key, std::move(bkcl_comm_context));
}
12 changes: 9 additions & 3 deletions python/paddle/distributed/launch/context/device.py
@@ -101,12 +101,18 @@ def parse_device(self):
)
if visible_devices_str in os.environ:
visible_devices = os.getenv(visible_devices_str)
elif 'CUDA_VISIBLE_DEVICES' in os.environ:
dev._dtype = DeviceType.GPU
visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
elif 'XPULINK_VISIBLE_DEVICES' in os.environ:
dev._dtype = DeviceType.XPU
visible_devices = os.getenv("XPULINK_VISIBLE_DEVICES")
elif 'XPU_VISIBLE_DEVICES' in os.environ:
dev._dtype = DeviceType.XPU
visible_devices = os.getenv("XPU_VISIBLE_DEVICES")
elif 'CUDA_VISIBLE_DEVICES' in os.environ:
if core.is_compiled_with_xpu():
dev._dtype = DeviceType.XPU
else:
dev._dtype = DeviceType.GPU
visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")

if visible_devices is not None and visible_devices != 'all':
dev._labels = visible_devices.split(',')
