Merge pull request BVLC#159 from drnikolaev/caffe-0.15-mem
GPU Memory Manager refactored
drnikolaev committed Jun 6, 2016
2 parents daa511d + 7ba7ad6 commit 057ace8
Showing 13 changed files with 183 additions and 180 deletions.
2 changes: 1 addition & 1 deletion include/caffe/layers/cudnn_conv_layer.hpp
@@ -64,7 +64,7 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
   size_t *workspace_fwd_sizes_;
   size_t *workspace_bwd_data_sizes_;
   size_t *workspace_bwd_filter_sizes_;
-  GPUMemoryManager::Buffer workspace;
+  GPUMemory::Workspace workspace;
   int backward_passed_ctr_;
 };
 #endif

2 changes: 1 addition & 1 deletion include/caffe/layers/cudnn_lcn_layer.hpp
@@ -41,7 +41,7 @@ class CuDNNLCNLayer : public LRNLayer<Dtype> {
   Dtype alpha_, beta_, k_;
 
   size_t tempDataSize_;
-  GPUMemoryManager::Buffer temp1_, temp2_;
+  GPUMemory::Workspace temp1_, temp2_;
 };
 #endif

167 changes: 82 additions & 85 deletions include/caffe/util/gpu_memory.hpp
@@ -3,87 +3,82 @@
 
 #include <vector>
 #include "caffe/common.hpp"
+#ifndef CPU_ONLY
 
 namespace cub {
   class CachingDeviceAllocator;
 }
 
 namespace caffe {
 
-class GPUMemoryManager {
- public:
-  enum PoolMode {
-    NO_POOL,   // Straight CUDA malloc/free (may be expensive)
-    CUB_POOL,  // CUB caching allocator
-#ifdef CPU_ONLY
-    DEFAULT_POOL = NO_POOL
-#else
-    DEFAULT_POOL = CUB_POOL  // CUB pool is able to use unified memory properly
-#endif
-  };
-
-  static const char* pool_name();
-  static bool using_pool() {
-    return mode_ != NO_POOL;
-  }
-
-  class Arena {
-   public:
-    Arena(const std::vector<int>& gpus, PoolMode m = DEFAULT_POOL, bool debug =
-        false) {
-      init(gpus, m, debug);
-    }
-    ~Arena() {
-      destroy();
-    }
-  };
-
-#ifndef CPU_ONLY
-  class Buffer {
-   public:
-    // Construction/destruction
-    Buffer() :
-        ptr_(NULL), stream_(), size_(0) {
-    }
-    Buffer(size_t size, cudaStream_t s = cudaStreamDefault) :
-        stream_(s) {
+struct GPUMemory {
+  static void GetInfo(size_t* free_mem, size_t* used_mem) {
+    return mgr_.GetInfo(free_mem, used_mem);
+  }
+
+  template <class Any>
+  static void allocate(Any** ptr, size_t size,
+      cudaStream_t stream = cudaStreamDefault) {
+    CHECK(try_allocate(reinterpret_cast<void**>(ptr), size, stream));
+  }
+
+  static void deallocate(void* ptr,
+      cudaStream_t stream = cudaStreamDefault) {
+    mgr_.deallocate(ptr, stream);
+  }
+
+  static bool try_allocate(void** ptr, size_t size,
+      cudaStream_t stream = cudaStreamDefault) {
+    return mgr_.try_allocate(ptr, size, stream);
+  }
+
+  enum Mode {
+    CUDA_MALLOC,   // Straight CUDA malloc/free (may be expensive)
+    CUB_ALLOCATOR  // CUB caching allocator
+  };
+
+  // Scope initializes global Memory Manager for a given scope.
+  // It's instantiated in test(), train() and time() Caffe brewing functions
+  // as well as in unit tests main().
+  struct Scope {
+    Scope(const std::vector<int>& gpus, Mode m = CUB_ALLOCATOR,
+          bool debug = false) {
+      mgr_.init(gpus, m, debug);
+    }
+  };
+
+  // Workspace's release() functionality depends on global pool availability
+  // If pool is available, it returns memory to the pool and sets ptr to NULL
+  // If pool is not available, it retains memory.
+  struct Workspace {
+    Workspace() : ptr_(NULL), stream_(), size_(0) {}
+    Workspace(size_t size, cudaStream_t s = cudaStreamDefault) : stream_(s) {
       reserve(size);
     }
-    ~Buffer() {
-      GPUMemoryManager::deallocate(ptr_, stream_);
-    }
+    ~Workspace() { mgr_.deallocate(ptr_, stream_); }
 
-    // Accessors
-    void* data() const {
-      return ptr_;
-    }
-    size_t size() const {
-      return size_;
-    }
+    void* data() const { return ptr_; }
+    size_t size() const { return size_; }
 
-    // Memory allocation/release
     bool try_reserve(size_t size) {
       bool status = true;
       if (size > size_) {
         if (ptr_) {
-          GPUMemoryManager::deallocate(ptr_, stream_);
+          mgr_.deallocate(ptr_, stream_);
         }
-        status = GPUMemoryManager::try_allocate(&ptr_, size, stream_);
+        status = mgr_.try_allocate(&ptr_, size, stream_);
         if (status) {
           size_ = size;
         }
       }
       return status;
     }
 
-    void reserve(size_t size) {
-      CHECK(try_reserve(size));
-    }
+    void reserve(size_t size) { CHECK(try_reserve(size)); }
 
-    /*
-     * This method behaves differently depending on pool availability:
-     * If pool is available, it returns memory to the pool and sets ptr to NULL
-     * If pool is not available, it does nothing (retaining memory)
-     */
     void release() {
-      if (GPUMemoryManager::using_pool()) {
-        GPUMemoryManager::deallocate(ptr_, stream_);
+      if (mgr_.using_pool()) {
+        mgr_.deallocate(ptr_, stream_);
         ptr_ = NULL;
         size_ = 0;
       }
@@ -95,44 +90,46 @@ class GPUMemoryManager {
     cudaStream_t stream_;
     size_t size_;
   };
-  static void update_dev_info(int device);
-#endif  // CPU_ONLY
 
  private:
-  static void init(const std::vector<int>&, PoolMode, bool);
-  static void destroy();
-
-  static bool initialized_;
-  static PoolMode mode_;
-  static bool debug_;
-
-#ifndef CPU_ONLY
-  struct MemInfo {
-    MemInfo() {
-      free_ = total_ = flush_count_ = 0;
-    }
-    size_t free_;
-    size_t total_;
-    unsigned flush_count_;
-  };
-  static vector<MemInfo> dev_info_;
-
- public:
-  typedef void* pointer;
-  static bool try_allocate(pointer* ptr, size_t size, cudaStream_t stream =
-      cudaStreamDefault);
-  static void allocate(pointer* ptr, size_t size, cudaStream_t stream =
-      cudaStreamDefault) {
-    CHECK(try_allocate(ptr, size, stream));
-  }
-  static void deallocate(pointer ptr, cudaStream_t = cudaStreamDefault);
-  static void GetInfo(size_t* free_mem, size_t* used_mem);
-
- private:
-  static void InitMemory(const std::vector<int>& gpus, PoolMode m);
-#endif
+  struct Manager {
+    Manager();
+    ~Manager();
+    void GetInfo(size_t* free_mem, size_t* used_mem);
+    void deallocate(void* ptr, cudaStream_t stream);
+    bool try_allocate(void** ptr, size_t size, cudaStream_t);
+    const char* pool_name() const;
+    bool using_pool() const { return mode_ != CUDA_MALLOC; }
+    void init(const std::vector<int>&, Mode, bool);
+
+    Mode mode_;
+    bool debug_;
+
+   private:
+    struct DevInfo {
+      DevInfo() {
+        free_ = total_ = flush_count_ = 0;
+      }
+      size_t free_;
+      size_t total_;
+      unsigned flush_count_;
+    };
+    void update_dev_info(int device);
+    vector<DevInfo> dev_info_;
+    bool initialized_;
+    cub::CachingDeviceAllocator* cub_allocator_;
+
+    static unsigned int BIN_GROWTH;  ///< Geometric growth factor for bin-sizes
+    static unsigned int MIN_BIN;     ///< Minimum bin
+    static unsigned int MAX_BIN;     ///< Maximum bin
+    static size_t MAX_CACHED_BYTES;  ///< Maximum aggregate cached bytes
+  };
+
+  static Manager mgr_;
 };
 
 }  // namespace caffe
 
+#endif
+
 #endif
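
A minimal usage sketch of the API declared above; illustrative only, not part of this commit. The example function, sizes, and fallback logic are hypothetical, while Scope, Workspace, try_reserve(), reserve(), release(), allocate() and deallocate() come straight from the header:

// Hypothetical sketch (not in this PR): exercising the refactored GPUMemory
// API from include/caffe/util/gpu_memory.hpp.
#include <vector>
#include "caffe/util/gpu_memory.hpp"

void example(const std::vector<int>& gpus) {
  // A Scope initializes the global memory manager (CUB caching allocator by
  // default) for its lifetime; train(), test() and time() each create one.
  caffe::GPUMemory::Scope gpu_memory_scope(gpus);

  // A Workspace grows monotonically: try_reserve() reallocates only when the
  // requested size exceeds the current one, and reports failure via bool.
  caffe::GPUMemory::Workspace workspace;
  if (!workspace.try_reserve(64 * 1024 * 1024)) {
    // Fall back to a smaller working set instead of aborting.
    workspace.reserve(16 * 1024 * 1024);  // CHECK-fails if this also fails
  }

  // ... launch kernels using workspace.data() / workspace.size() ...

  // With a pool active, release() returns memory to the pool and resets the
  // pointer; without one, the buffer is retained until destruction.
  workspace.release();

  // Raw allocations are also available; the template allocate() performs
  // the void** cast internally.
  float* buf = NULL;
  caffe::GPUMemory::allocate(&buf, 1024 * sizeof(float));
  caffe::GPUMemory::deallocate(buf);
}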
19 changes: 18 additions & 1 deletion python/caffe/_caffe.cpp
@@ -18,6 +18,7 @@
 #include "caffe/layers/memory_data_layer.hpp"
 #include "caffe/layers/python_layer.hpp"
 #include "caffe/sgd_solvers.hpp"
+#include "caffe/util/gpu_memory.hpp"
 
 // Temporary solution for numpy < 1.7 versions: old macro, no promises.
 // You're strongly advised to upgrade to >= 1.7.
@@ -51,9 +52,25 @@ namespace caffe {
 typedef float Dtype;
 const int NPY_DTYPE = NPY_FLOAT32;
 
+#ifndef CPU_ONLY
+shared_ptr<GPUMemory::Scope> gpu_memory_scope;
+#endif
+
 // Selecting mode.
 void set_mode_cpu() { Caffe::set_mode(Caffe::CPU); }
-void set_mode_gpu() { Caffe::set_mode(Caffe::GPU); }
+void set_mode_gpu() {
+  Caffe::set_mode(Caffe::GPU);
+#ifndef CPU_ONLY
+  vector<int> gpus;
+  int count = 0;
+  CUDA_CHECK(cudaGetDeviceCount(&count));
+  for (int i = 0; i < count; ++i) {
+    gpus.push_back(i);
+  }
+  CHECK_GT(gpus.size(), 0);
+  gpu_memory_scope.reset(new GPUMemory::Scope(gpus));
+#endif
+}
 
 // For convenience, check that input files can be opened, and raise an
 // exception that boost will send to Python if not (caffe could still crash

File renamed without changes.
2 changes: 1 addition & 1 deletion src/caffe/layers/cudnn_conv_layer.cpp
@@ -98,7 +98,7 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
   // Specify workspace limit for kernels directly until we have a
   // planning strategy and a rewrite of Caffe's GPU memory mangagement
   size_t workspace_limit_bytes, total_memory;
-  GPUMemoryManager::GetInfo(&workspace_limit_bytes, &total_memory);
+  GPUMemory::GetInfo(&workspace_limit_bytes, &total_memory);
 
   for (int i = 0; i < bottom.size(); i++) {
     cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i],

4 changes: 2 additions & 2 deletions src/caffe/layers/cudnn_conv_layer.cu
@@ -22,7 +22,7 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
 
   // Test free space and force reshape if allocations have changed
   size_t workspace_limit_bytes, total_memory;
-  GPUMemoryManager::GetInfo(&workspace_limit_bytes, &total_memory);
+  GPUMemory::GetInfo(&workspace_limit_bytes, &total_memory);
   if (workspace_fwd_sizes_[i] > workspace_limit_bytes) {
     this->Reshape(bottom, top);
   }
@@ -82,7 +82,7 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   const Dtype* top_diff = top[i]->gpu_diff();
   // Test free space and force reshape if allocations have changed
   size_t workspace_limit_bytes, total_memory;
-  GPUMemoryManager::GetInfo(&workspace_limit_bytes, &total_memory);
+  GPUMemory::GetInfo(&workspace_limit_bytes, &total_memory);
   if (workspace_bwd_filter_sizes_[i] > workspace_limit_bytes ||
       workspace_bwd_data_sizes_[i] > workspace_limit_bytes ||
       // We need to get workspace sizes for the default algos at 1st run

8 changes: 4 additions & 4 deletions src/caffe/parallel.cpp
@@ -90,15 +90,15 @@ GPUParams<Dtype>::GPUParams(shared_ptr<Solver<Dtype> > root_solver, int device)
   // Allocate device buffers
   CUDA_CHECK(cudaSetDevice(device));
   buffer_device_ = device;
-  GPUMemoryManager::allocate(reinterpret_cast<void **>(&data_),
+  GPUMemory::allocate(reinterpret_cast<void **>(&data_),
       size_ * sizeof(Dtype));
 
   // Copy blob values
   const vector<Blob<Dtype>*>& net =
       root_solver->net()->learnable_params();
   apply_buffers(net, data_, size_, copy);
 
-  GPUMemoryManager::allocate(reinterpret_cast<void **>(&diff_),
+  GPUMemory::allocate(reinterpret_cast<void **>(&diff_),
       size_ * sizeof(Dtype));
   caffe_gpu_set(size_, Dtype(0), diff_);
 
@@ -114,8 +114,8 @@ GPUParams<Dtype>::~GPUParams() {
   int initial_device;
   CUDA_CHECK(cudaGetDevice(&initial_device));
   CUDA_CHECK(cudaSetDevice(buffer_device_));
-  GPUMemoryManager::deallocate(data_);
-  GPUMemoryManager::deallocate(diff_);
+  GPUMemory::deallocate(data_);
+  GPUMemory::deallocate(diff_);
   data_ = NULL;
   diff_ = NULL;
   CUDA_CHECK(cudaSetDevice(initial_device));
11 changes: 5 additions & 6 deletions src/caffe/syncedmem.cpp
@@ -17,7 +17,7 @@ SyncedMemory::~SyncedMemory() {
     if (gpu_device_ != -1) {
       CUDA_CHECK(cudaSetDevice(gpu_device_));
     }
-    GPUMemoryManager::deallocate(gpu_ptr_);
+    GPUMemory::deallocate(gpu_ptr_);
     cudaSetDevice(initial_device);
   }
 #endif  // CPU_ONLY
@@ -54,15 +54,15 @@ inline void SyncedMemory::to_gpu() {
   switch (head_) {
   case UNINITIALIZED:
     CUDA_CHECK(cudaGetDevice(&gpu_device_));
-    GPUMemoryManager::allocate(&gpu_ptr_, size_);
+    GPUMemory::allocate(&gpu_ptr_, size_);
     caffe_gpu_memset(size_, 0, gpu_ptr_);
     head_ = HEAD_AT_GPU;
     own_gpu_data_ = true;
     break;
   case HEAD_AT_CPU:
     if (gpu_ptr_ == NULL) {
       CUDA_CHECK(cudaGetDevice(&gpu_device_));
-      GPUMemoryManager::allocate(&gpu_ptr_, size_);
+      GPUMemory::allocate(&gpu_ptr_, size_);
       own_gpu_data_ = true;
     }
     caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_);
@@ -111,7 +111,7 @@ void SyncedMemory::set_gpu_data(void* data) {
     if (gpu_device_ != -1) {
       CUDA_CHECK(cudaSetDevice(gpu_device_));
     }
-    GPUMemoryManager::deallocate(gpu_ptr_);
+    GPUMemory::deallocate(gpu_ptr_);
     cudaSetDevice(initial_device);
   }
   gpu_ptr_ = data;
@@ -144,7 +144,7 @@ void SyncedMemory::async_gpu_push(const cudaStream_t& stream) {
   CHECK(head_ == HEAD_AT_CPU);
   if (gpu_ptr_ == NULL) {
     CUDA_CHECK(cudaGetDevice(&gpu_device_));
-    GPUMemoryManager::allocate(&gpu_ptr_, size_);
+    GPUMemory::allocate(&gpu_ptr_, size_);
     own_gpu_data_ = true;
   }
   const cudaMemcpyKind put = cudaMemcpyHostToDevice;
@@ -155,4 +155,3 @@
 #endif
 
 }  // namespace caffe
-

2 changes: 2 additions & 0 deletions src/caffe/test/test_blob.cpp
@@ -35,12 +35,14 @@ TYPED_TEST(BlobSimpleTest, TestInitialization) {
   EXPECT_EQ(this->blob_->count(), 0);
 }
 
+#ifndef CPU_ONLY
 TYPED_TEST(BlobSimpleTest, TestPointersCPUGPU) {
   EXPECT_TRUE(this->blob_preshaped_->gpu_data());
   EXPECT_TRUE(this->blob_preshaped_->cpu_data());
   EXPECT_TRUE(this->blob_preshaped_->mutable_gpu_data());
   EXPECT_TRUE(this->blob_preshaped_->mutable_cpu_data());
 }
+#endif
 
 TYPED_TEST(BlobSimpleTest, TestReshape) {
   this->blob_->Reshape(2, 3, 4, 5);

2 changes: 1 addition & 1 deletion src/caffe/test/test_caffe_main.cpp
@@ -50,7 +50,7 @@ int main(int argc, char** argv) {
   cout << "Current device id: " << device << endl;
   cudaGetDeviceProperties(&CAFFE_TEST_CUDA_PROP, device);
   cout << "Current device name: " << CAFFE_TEST_CUDA_PROP.name << endl;
-  caffe::GPUMemoryManager::Arena arena(devices);
+  caffe::GPUMemory::Scope gpu_memory_scope(devices);
 
 #endif
   // invoke the test.