Merge pull request BVLC#159 from drnikolaev/caffe-0.15-mem
GPU Memory Manager refactored
drnikolaev committed Jun 6, 2016
2 parents daa511d + 7ba7ad6 commit 057ace8
Showing 13 changed files with 183 additions and 180 deletions.
2 changes: 1 addition & 1 deletion include/caffe/layers/cudnn_conv_layer.hpp
@@ -64,7 +64,7 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
   size_t *workspace_fwd_sizes_;
   size_t *workspace_bwd_data_sizes_;
   size_t *workspace_bwd_filter_sizes_;
-  GPUMemoryManager::Buffer workspace;
+  GPUMemory::Workspace workspace;
   int backward_passed_ctr_;
 };
 #endif

2 changes: 1 addition & 1 deletion include/caffe/layers/cudnn_lcn_layer.hpp
@@ -41,7 +41,7 @@ class CuDNNLCNLayer : public LRNLayer<Dtype> {
   Dtype alpha_, beta_, k_;
 
   size_t tempDataSize_;
-  GPUMemoryManager::Buffer temp1_, temp2_;
+  GPUMemory::Workspace temp1_, temp2_;
 };
 #endif

167 changes: 82 additions & 85 deletions include/caffe/util/gpu_memory.hpp
@@ -3,87 +3,82 @@
 
 #include <vector>
 #include "caffe/common.hpp"
+#ifndef CPU_ONLY
 
 namespace cub {
   class CachingDeviceAllocator;
 }
 
 namespace caffe {
 
-class GPUMemoryManager {
- public:
-  enum PoolMode {
-    NO_POOL,   // Straight CUDA malloc/free (may be expensive)
-    CUB_POOL,  // CUB caching allocator
-#ifdef CPU_ONLY
-    DEFAULT_POOL = NO_POOL
-#else
-    DEFAULT_POOL = CUB_POOL  // CUB pool is able to use unified memory properly
-#endif
-  };
-
-  static const char* pool_name();
-  static bool using_pool() {
-    return mode_ != NO_POOL;
-  }
-
-  class Arena {
-   public:
-    Arena(const std::vector<int>& gpus, PoolMode m = DEFAULT_POOL, bool debug =
-        false) {
-      init(gpus, m, debug);
-    }
-    ~Arena() {
-      destroy();
-    }
-  };
-
-#ifndef CPU_ONLY
-  class Buffer {
-   public:
-    // Construction/destruction
-    Buffer() :
-        ptr_(NULL), stream_(), size_(0) {
-    }
-    Buffer(size_t size, cudaStream_t s = cudaStreamDefault) :
-        stream_(s) {
+struct GPUMemory {
+  static void GetInfo(size_t* free_mem, size_t* used_mem) {
+    return mgr_.GetInfo(free_mem, used_mem);
+  }
+
+  template <class Any>
+  static void allocate(Any** ptr, size_t size,
+      cudaStream_t stream = cudaStreamDefault) {
+    CHECK(try_allocate(reinterpret_cast<void**>(ptr), size, stream));
+  }
+
+  static void deallocate(void* ptr,
+      cudaStream_t stream = cudaStreamDefault) {
+    mgr_.deallocate(ptr, stream);
+  }
+
+  static bool try_allocate(void** ptr, size_t size,
+      cudaStream_t stream = cudaStreamDefault) {
+    return mgr_.try_allocate(ptr, size, stream);
+  }
+
+  enum Mode {
+    CUDA_MALLOC,   // Straight CUDA malloc/free (may be expensive)
+    CUB_ALLOCATOR  // CUB caching allocator
+  };
+
+  // Scope initializes global Memory Manager for a given scope.
+  // It's instantiated in test(), train() and time() Caffe brewing functions
+  // as well as in unit tests main().
+  struct Scope {
+    Scope(const std::vector<int>& gpus, Mode m = CUB_ALLOCATOR,
+          bool debug = false) {
+      mgr_.init(gpus, m, debug);
+    }
+  };
+
+  // Workspace's release() functionality depends on global pool availability
+  // If pool is available, it returns memory to the pool and sets ptr to NULL
+  // If pool is not available, it retains memory.
+  struct Workspace {
+    Workspace() : ptr_(NULL), stream_(), size_(0) {}
+    Workspace(size_t size, cudaStream_t s = cudaStreamDefault) : stream_(s) {
       reserve(size);
     }
-    ~Buffer() {
-      GPUMemoryManager::deallocate(ptr_, stream_);
-    }
+    ~Workspace() { mgr_.deallocate(ptr_, stream_); }
 
-    // Accessors
-    void* data() const {
-      return ptr_;
-    }
-    size_t size() const {
-      return size_;
-    }
+    void* data() const { return ptr_; }
+    size_t size() const { return size_; }
 
-    // Memory allocation/release
     bool try_reserve(size_t size) {
       bool status = true;
       if (size > size_) {
         if (ptr_) {
-          GPUMemoryManager::deallocate(ptr_, stream_);
+          mgr_.deallocate(ptr_, stream_);
         }
-        status = GPUMemoryManager::try_allocate(&ptr_, size, stream_);
+        status = mgr_.try_allocate(&ptr_, size, stream_);
         if (status) {
           size_ = size;
         }
       }
       return status;
     }
 
-    void reserve(size_t size) {
-      CHECK(try_reserve(size));
-    }
+    void reserve(size_t size) { CHECK(try_reserve(size)); }
 
-    /*
-     * This method behaves differently depending on pool availability:
-     * If pool is available, it returns memory to the pool and sets ptr to NULL
-     * If pool is not available, it does nothing (retaining memory)
-     */
     void release() {
-      if (GPUMemoryManager::using_pool()) {
-        GPUMemoryManager::deallocate(ptr_, stream_);
+      if (mgr_.using_pool()) {
+        mgr_.deallocate(ptr_, stream_);
         ptr_ = NULL;
         size_ = 0;
       }
@@ -95,44 +90,46 @@ class GPUMemoryManager {
     cudaStream_t stream_;
     size_t size_;
   };
-  static void update_dev_info(int device);
-#endif  // CPU_ONLY
 
  private:
-  static void init(const std::vector<int>&, PoolMode, bool);
-  static void destroy();
-
-  static bool initialized_;
-  static PoolMode mode_;
-  static bool debug_;
-
-#ifndef CPU_ONLY
-  struct MemInfo {
-    MemInfo() {
-      free_ = total_ = flush_count_ = 0;
-    }
-    size_t free_;
-    size_t total_;
-    unsigned flush_count_;
-  };
-  static vector<MemInfo> dev_info_;
-
- public:
-  typedef void* pointer;
-  static bool try_allocate(pointer* ptr, size_t size, cudaStream_t stream =
-      cudaStreamDefault);
-  static void allocate(pointer* ptr, size_t size, cudaStream_t stream =
-      cudaStreamDefault) {
-    CHECK(try_allocate(ptr, size, stream));
-  }
-  static void deallocate(pointer ptr, cudaStream_t = cudaStreamDefault);
-  static void GetInfo(size_t* free_mem, size_t* used_mem);
-
- private:
-  static void InitMemory(const std::vector<int>& gpus, PoolMode m);
-#endif
+  struct Manager {
+    Manager();
+    ~Manager();
+    void GetInfo(size_t* free_mem, size_t* used_mem);
+    void deallocate(void* ptr, cudaStream_t stream);
+    bool try_allocate(void** ptr, size_t size, cudaStream_t);
+    const char* pool_name() const;
+    bool using_pool() const { return mode_ != CUDA_MALLOC; }
+    void init(const std::vector<int>&, Mode, bool);
+
+    Mode mode_;
+    bool debug_;
+
+   private:
+    struct DevInfo {
+      DevInfo() {
+        free_ = total_ = flush_count_ = 0;
+      }
+      size_t free_;
+      size_t total_;
+      unsigned flush_count_;
+    };
+    void update_dev_info(int device);
+    vector<DevInfo> dev_info_;
+    bool initialized_;
+    cub::CachingDeviceAllocator* cub_allocator_;
+
+    static unsigned int BIN_GROWTH;  ///< Geometric growth factor for bin-sizes
+    static unsigned int MIN_BIN;     ///< Minimum bin
+    static unsigned int MAX_BIN;     ///< Maximum bin
+    static size_t MAX_CACHED_BYTES;  ///< Maximum aggregate cached bytes
+  };
+
+  static Manager mgr_;
 };
 
 }  // namespace caffe
 
+#endif
+
 #endif
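
A minimal usage sketch of the API declared above; illustrative only, not part of this commit. The example function, sizes, and fallback logic are hypothetical, while Scope, Workspace, try_reserve(), reserve(), release(), allocate() and deallocate() come straight from the header:

// Hypothetical sketch (not in this PR): exercising the refactored GPUMemory
// API from include/caffe/util/gpu_memory.hpp.
#include <vector>
#include "caffe/util/gpu_memory.hpp"

void example(const std::vector<int>& gpus) {
  // A Scope initializes the global memory manager (CUB caching allocator by
  // default) for its lifetime; train(), test() and time() each create one.
  caffe::GPUMemory::Scope gpu_memory_scope(gpus);

  // A Workspace grows monotonically: try_reserve() reallocates only when the
  // requested size exceeds the current one, and reports failure via bool.
  caffe::GPUMemory::Workspace workspace;
  if (!workspace.try_reserve(64 * 1024 * 1024)) {
    // Fall back to a smaller working set instead of aborting.
    workspace.reserve(16 * 1024 * 1024);  // CHECK-fails if this also fails
  }

  // ... launch kernels using workspace.data() / workspace.size() ...

  // With a pool active, release() returns memory to the pool and resets the
  // pointer; without one, the buffer is retained until destruction.
  workspace.release();

  // Raw allocations are also available; the template allocate() performs
  // the void** cast internally.
  float* buf = NULL;
  caffe::GPUMemory::allocate(&buf, 1024 * sizeof(float));
  caffe::GPUMemory::deallocate(buf);
}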
19 changes: 18 additions & 1 deletion python/caffe/_caffe.cpp
@@ -18,6 +18,7 @@
 #include "caffe/layers/memory_data_layer.hpp"
 #include "caffe/layers/python_layer.hpp"
 #include "caffe/sgd_solvers.hpp"
+#include "caffe/util/gpu_memory.hpp"
 
 // Temporary solution for numpy < 1.7 versions: old macro, no promises.
 // You're strongly advised to upgrade to >= 1.7.
@@ -51,9 +52,25 @@ namespace caffe {
 typedef float Dtype;
 const int NPY_DTYPE = NPY_FLOAT32;
 
+#ifndef CPU_ONLY
+shared_ptr<GPUMemory::Scope> gpu_memory_scope;
+#endif
+
 // Selecting mode.
 void set_mode_cpu() { Caffe::set_mode(Caffe::CPU); }
-void set_mode_gpu() { Caffe::set_mode(Caffe::GPU); }
+void set_mode_gpu() {
+  Caffe::set_mode(Caffe::GPU);
+#ifndef CPU_ONLY
+  vector<int> gpus;
+  int count = 0;
+  CUDA_CHECK(cudaGetDeviceCount(&count));
+  for (int i = 0; i < count; ++i) {
+    gpus.push_back(i);
+  }
+  CHECK_GT(gpus.size(), 0);
+  gpu_memory_scope.reset(new GPUMemory::Scope(gpus));
+#endif
+}
 
 // For convenience, check that input files can be opened, and raise an
 // exception that boost will send to Python if not (caffe could still crash

File renamed without changes.
2 changes: 1 addition & 1 deletion src/caffe/layers/cudnn_conv_layer.cpp
@@ -98,7 +98,7 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
   // Specify workspace limit for kernels directly until we have a
   // planning strategy and a rewrite of Caffe's GPU memory mangagement
   size_t workspace_limit_bytes, total_memory;
-  GPUMemoryManager::GetInfo(&workspace_limit_bytes, &total_memory);
+  GPUMemory::GetInfo(&workspace_limit_bytes, &total_memory);
 
   for (int i = 0; i < bottom.size(); i++) {
     cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i],

4 changes: 2 additions & 2 deletions src/caffe/layers/cudnn_conv_layer.cu
@@ -22,7 +22,7 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
 
   // Test free space and force reshape if allocations have changed
   size_t workspace_limit_bytes, total_memory;
-  GPUMemoryManager::GetInfo(&workspace_limit_bytes, &total_memory);
+  GPUMemory::GetInfo(&workspace_limit_bytes, &total_memory);
   if (workspace_fwd_sizes_[i] > workspace_limit_bytes) {
     this->Reshape(bottom, top);
   }
@@ -82,7 +82,7 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   const Dtype* top_diff = top[i]->gpu_diff();
   // Test free space and force reshape if allocations have changed
   size_t workspace_limit_bytes, total_memory;
-  GPUMemoryManager::GetInfo(&workspace_limit_bytes, &total_memory);
+  GPUMemory::GetInfo(&workspace_limit_bytes, &total_memory);
   if (workspace_bwd_filter_sizes_[i] > workspace_limit_bytes ||
       workspace_bwd_data_sizes_[i] > workspace_limit_bytes ||
       // We need to get workspace sizes for the default algos at 1st run

8 changes: 4 additions & 4 deletions src/caffe/parallel.cpp
@@ -90,15 +90,15 @@ GPUParams<Dtype>::GPUParams(shared_ptr<Solver<Dtype> > root_solver, int device)
   // Allocate device buffers
   CUDA_CHECK(cudaSetDevice(device));
   buffer_device_ = device;
-  GPUMemoryManager::allocate(reinterpret_cast<void **>(&data_),
+  GPUMemory::allocate(reinterpret_cast<void **>(&data_),
       size_ * sizeof(Dtype));
 
   // Copy blob values
   const vector<Blob<Dtype>*>& net =
       root_solver->net()->learnable_params();
   apply_buffers(net, data_, size_, copy);
 
-  GPUMemoryManager::allocate(reinterpret_cast<void **>(&diff_),
+  GPUMemory::allocate(reinterpret_cast<void **>(&diff_),
       size_ * sizeof(Dtype));
   caffe_gpu_set(size_, Dtype(0), diff_);
 
@@ -114,8 +114,8 @@ GPUParams<Dtype>::~GPUParams() {
   int initial_device;
   CUDA_CHECK(cudaGetDevice(&initial_device));
   CUDA_CHECK(cudaSetDevice(buffer_device_));
-  GPUMemoryManager::deallocate(data_);
-  GPUMemoryManager::deallocate(diff_);
+  GPUMemory::deallocate(data_);
+  GPUMemory::deallocate(diff_);
   data_ = NULL;
   diff_ = NULL;
   CUDA_CHECK(cudaSetDevice(initial_device));
11 changes: 5 additions & 6 deletions src/caffe/syncedmem.cpp
@@ -17,7 +17,7 @@ SyncedMemory::~SyncedMemory() {
     if (gpu_device_ != -1) {
       CUDA_CHECK(cudaSetDevice(gpu_device_));
     }
-    GPUMemoryManager::deallocate(gpu_ptr_);
+    GPUMemory::deallocate(gpu_ptr_);
     cudaSetDevice(initial_device);
   }
 #endif  // CPU_ONLY
@@ -54,15 +54,15 @@ inline void SyncedMemory::to_gpu() {
   switch (head_) {
   case UNINITIALIZED:
     CUDA_CHECK(cudaGetDevice(&gpu_device_));
-    GPUMemoryManager::allocate(&gpu_ptr_, size_);
+    GPUMemory::allocate(&gpu_ptr_, size_);
     caffe_gpu_memset(size_, 0, gpu_ptr_);
     head_ = HEAD_AT_GPU;
     own_gpu_data_ = true;
     break;
   case HEAD_AT_CPU:
     if (gpu_ptr_ == NULL) {
       CUDA_CHECK(cudaGetDevice(&gpu_device_));
-      GPUMemoryManager::allocate(&gpu_ptr_, size_);
+      GPUMemory::allocate(&gpu_ptr_, size_);
       own_gpu_data_ = true;
     }
     caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_);
@@ -111,7 +111,7 @@ void SyncedMemory::set_gpu_data(void* data) {
     if (gpu_device_ != -1) {
       CUDA_CHECK(cudaSetDevice(gpu_device_));
     }
-    GPUMemoryManager::deallocate(gpu_ptr_);
+    GPUMemory::deallocate(gpu_ptr_);
     cudaSetDevice(initial_device);
   }
   gpu_ptr_ = data;
@@ -144,7 +144,7 @@ void SyncedMemory::async_gpu_push(const cudaStream_t& stream) {
   CHECK(head_ == HEAD_AT_CPU);
   if (gpu_ptr_ == NULL) {
     CUDA_CHECK(cudaGetDevice(&gpu_device_));
-    GPUMemoryManager::allocate(&gpu_ptr_, size_);
+    GPUMemory::allocate(&gpu_ptr_, size_);
     own_gpu_data_ = true;
   }
   const cudaMemcpyKind put = cudaMemcpyHostToDevice;
@@ -155,4 +155,3 @@
 #endif
 
 }  // namespace caffe
-

2 changes: 2 additions & 0 deletions src/caffe/test/test_blob.cpp
@@ -35,12 +35,14 @@ TYPED_TEST(BlobSimpleTest, TestInitialization) {
   EXPECT_EQ(this->blob_->count(), 0);
 }
 
+#ifndef CPU_ONLY
 TYPED_TEST(BlobSimpleTest, TestPointersCPUGPU) {
   EXPECT_TRUE(this->blob_preshaped_->gpu_data());
   EXPECT_TRUE(this->blob_preshaped_->cpu_data());
   EXPECT_TRUE(this->blob_preshaped_->mutable_gpu_data());
   EXPECT_TRUE(this->blob_preshaped_->mutable_cpu_data());
 }
+#endif
 
 TYPED_TEST(BlobSimpleTest, TestReshape) {
   this->blob_->Reshape(2, 3, 4, 5);

2 changes: 1 addition & 1 deletion src/caffe/test/test_caffe_main.cpp
@@ -50,7 +50,7 @@ int main(int argc, char** argv) {
   cout << "Current device id: " << device << endl;
   cudaGetDeviceProperties(&CAFFE_TEST_CUDA_PROP, device);
   cout << "Current device name: " << CAFFE_TEST_CUDA_PROP.name << endl;
-  caffe::GPUMemoryManager::Arena arena(devices);
+  caffe::GPUMemory::Scope gpu_memory_scope(devices);
 
 #endif
   // invoke the test.