diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h index e26c6e63ea07..25df0ba07d65 100644 --- a/src/storage/pooled_storage_manager.h +++ b/src/storage/pooled_storage_manager.h @@ -44,7 +44,8 @@ namespace storage { #if MXNET_USE_CUDA /*! - * \brief Storage manager with a memory pool on gpu. + * \brief Storage manager with a memory pool on gpu. Memory chunks are reused based on exact size + * match. */ class GPUPooledStorageManager final : public StorageManager { public: @@ -131,9 +132,9 @@ void GPUPooledStorageManager::Free(Storage::Handle handle) { } void GPUPooledStorageManager::ReleaseAll() { - Storage::Handle handle; for (auto&& i : memory_pool_) { for (auto&& j : i.second) { + Storage::Handle handle; handle.dptr = j; handle.size = i.first; DirectFreeNoLock(handle); @@ -144,6 +145,17 @@ void GPUPooledStorageManager::ReleaseAll() { /*! * \brief Storage manager with a memory pool, with rounded size, on gpu. + * + * This GPU memory pool uses a mixture of nearest-pow2 (exponential) rounding and + * nearest-multiple (linear) rounding to relieve memory allocation pressure in cases + * where the default exact-size-match pool falls short, such as variable-length + * input/output workloads like RNNs. + * + * \param cutoff log2 of the size at which rounding switches from exponential to linear. It is set + * through the MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF environment variable. Must be between + * 20 (1 MB) and 34 (16 GB). + * Suppose the cutoff is X; the memory size buckets then look like this: + * exp2(0), exp2(1), ..., exp2(X), 2*exp2(X), 3*exp2(X), ...
*/ class GPUPooledRoundedStorageManager final : public StorageManager { public: @@ -152,11 +164,28 @@ class GPUPooledRoundedStorageManager final : public StorageManager { */ GPUPooledRoundedStorageManager() { reserve_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_RESERVE", 5); - min_chunk_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_LOG2_MIN_CHUNK", 5); - if (min_chunk_ < 5) { - LOG(FATAL) << "MXNET_GPU_MEM_POOL_LOG2_MIN_CHUNK cannot be set to a value smaller than 5. " \ - << "Got " << min_chunk_ << "."; + min_chunk_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_MIN_CHUNK", 4096); + cut_off_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF", 24); + if (min_chunk_ < 32) { + LOG(FATAL) << "MXNET_GPU_MEM_POOL_MIN_CHUNK cannot be set to a value smaller than 32. " \ + << "Got: " << min_chunk_ << "."; + } + if (min_chunk_ != 1ul << log2_round_up(min_chunk_)) { + LOG(FATAL) << "MXNET_GPU_MEM_POOL_MIN_CHUNK must be a power of 2. Got: " << min_chunk_ << "."; + } else { + min_chunk_ = log2_round_up(min_chunk_); } + if (cut_off_ < 20 || cut_off_ > LOG2_MAX_MEM) { + LOG(FATAL) << "MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF cannot be set to a value " \ + << "smaller than 20 or greater than " << LOG2_MAX_MEM << ". Got: " \ + << cut_off_ << "."; + } + if (cut_off_ < min_chunk_) { + LOG(FATAL) << "MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF cannot be set to a value " \ + << "smaller than log2 of MXNET_GPU_MEM_POOL_MIN_CHUNK. Got: " \ + << cut_off_ << " vs " << min_chunk_ << "."; + } + memory_pool_ = std::vector<std::vector<void*>>((1ul << (LOG2_MAX_MEM - cut_off_)) + cut_off_); } /*! * \brief Default destructor.
@@ -169,7 +198,7 @@ class GPUPooledRoundedStorageManager final : public StorageManager { void Free(Storage::Handle handle) override; void DirectFree(Storage::Handle handle) override { - handle.size = 1ul << log2_round_up(handle.size); + handle.size = get_size(get_bucket(handle.size)); std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU)); DirectFreeNoLock(handle); } @@ -219,16 +248,43 @@ class GPUPooledRoundedStorageManager final : public StorageManager { #if defined(__clang__) || defined(__GNUC__) || defined(__WINDOWS__) inline int log2_round_up(size_t s) { - int fls = clz(s); // find last set - // must be bigger than min_chunk_ (which is at least 32 for nccl scatter) - return std::max(static_cast<int>(min_chunk_), (addr_width-fls) + ((ctz(s) < fls - 1)?1:0)); + int result = addr_width - 1 - clz(s); + return result + ((ctz(s) < result)?1:0); + } + inline int div_pow2_round_up(size_t s, int divisor_log2) { + // (1025, 10) -> 2 + // (2048, 10) -> 2 + // (2049, 10) -> 3 + int ffs = ctz(s); // find first set + return (s >> divisor_log2) + (ffs < divisor_log2 ? 1 : 0); } #else inline int log2_round_up(size_t s) { - return std::max(static_cast<int>(min_chunk_), - static_cast<int>(std::ceil(std::log2(s)))); + return static_cast<int>(std::ceil(std::log2(s))); + } + inline int div_pow2_round_up(size_t s, int divisor_log2) { + // (1025, 10) -> 2 + // (2048, 10) -> 2 + // (2049, 10) -> 3 + size_t divisor = static_cast<size_t>(1) << divisor_log2; // int would overflow for log2 >= 31 + return s / divisor + (s % divisor ? 
1 : 0); } #endif // defined(__clang__) || defined(__GNUC__) || defined(__WINDOWS__) + inline int get_bucket(size_t s) { + int log_size = log2_round_up(s); + if (log_size > static_cast<int>(cut_off_)) + return div_pow2_round_up(s, cut_off_) - 1 + cut_off_; + else + return std::max(log_size, static_cast<int>(min_chunk_)); + } + + inline size_t get_size(int bucket) { + if (bucket <= static_cast<int>(cut_off_)) + return static_cast<size_t>(1) << bucket; + else + return (bucket - cut_off_ + 1) * (static_cast<size_t>(1) << cut_off_); + } + void DirectFreeNoLock(Storage::Handle handle) { cudaError_t err = cudaFree(handle.dptr); // ignore unloading error, as memory has already been recycled @@ -242,20 +298,21 @@ class GPUPooledRoundedStorageManager final : public StorageManager { void ReleaseAll(); // number of devices const int NDEV = 32; + const size_t LOG2_MAX_MEM = 34; static const int addr_width = sizeof(size_t) * 8; // used memory - size_t used_memory_ = 0, min_chunk_; + size_t used_memory_ = 0, min_chunk_, cut_off_; // percentage of reserved memory int reserve_; // memory pool - std::array<std::vector<void*>, addr_width> memory_pool_; + std::vector<std::vector<void*>> memory_pool_; DISALLOW_COPY_AND_ASSIGN(GPUPooledRoundedStorageManager); }; // class GPUPooledRoundedStorageManager void GPUPooledRoundedStorageManager::Alloc(Storage::Handle* handle) { - int log2_size = log2_round_up(handle->size); - size_t size = 1ul << log2_size; - auto&& reuse_pool = memory_pool_[log2_size]; + int bucket = get_bucket(handle->size); + size_t size = get_size(bucket); + auto&& reuse_pool = memory_pool_[bucket]; std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU)); if (reuse_pool.size() == 0) { size_t free, total; @@ -278,17 +335,18 @@ void GPUPooledRoundedStorageManager::Alloc(Storage::Handle* handle) { } void GPUPooledRoundedStorageManager::Free(Storage::Handle handle) { - int log2_size = log2_round_up(handle.size); - auto&& reuse_pool = memory_pool_[log2_size]; + int bucket = get_bucket(handle.size); + auto&& reuse_pool = memory_pool_[bucket]; std::lock_guard<std::mutex> 
lock(Storage::Get()->GetMutex(Context::kGPU)); reuse_pool.push_back(handle.dptr); } void GPUPooledRoundedStorageManager::ReleaseAll() { - Storage::Handle handle; for (size_t i = 0; i < memory_pool_.size(); i++) { - handle.size = 1ul << i; + size_t size = get_size(i); // size_t: bucket sizes can exceed INT_MAX for (auto& j : memory_pool_[i]) { + Storage::Handle handle; + handle.size = size; handle.dptr = j; DirectFreeNoLock(handle); }