Merge pull request BVLC#1046 from shelhamer/cudnn

cuDNN acceleration

shelhamer committed Sep 8, 2014
2 parents 657a61c + 5a25c94 commit ca33841
Showing 30 changed files with 2,070 additions and 17 deletions.
9 changes: 8 additions & 1 deletion Makefile
@@ -253,10 +253,17 @@ endif
# Debugging
ifeq ($(DEBUG), 1)
COMMON_FLAGS += -DDEBUG -g -O0
NVCCFLAGS += -G
else
COMMON_FLAGS += -DNDEBUG -O2
endif

# cuDNN acceleration configuration.
ifeq ($(USE_CUDNN), 1)
LIBRARIES += cudnn
COMMON_FLAGS += -DUSE_CUDNN
endif

# CPU-only configuration
ifeq ($(CPU_ONLY), 1)
OBJS := $(PROTO_OBJS) $(CXX_OBJS)
@@ -299,7 +306,7 @@ LIBRARY_DIRS += $(BLAS_LIB)
# Complete build flags.
COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir))
CXXFLAGS += -pthread -fPIC $(COMMON_FLAGS) $(WARNINGS)
NVCCFLAGS := -ccbin=$(CXX) -Xcompiler -fPIC $(COMMON_FLAGS)
NVCCFLAGS += -ccbin=$(CXX) -Xcompiler -fPIC $(COMMON_FLAGS)
# mex may invoke an older gcc that is too liberal with -Wuninitialized
MATLAB_CXXFLAGS := $(CXXFLAGS) -Wno-uninitialized
LINKFLAGS += -fPIC $(COMMON_FLAGS) $(WARNINGS)
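
The `USE_CUDNN` switch above does two things: `-DUSE_CUDNN` exposes the cuDNN code paths at compile time, and `LIBRARIES += cudnn` links against `-lcudnn`. As a minimal illustrative sketch (not code from this commit), a translation unit built with these flags can gate its cuDNN usage like so:

    // Minimal sketch of compile-time gating on -DUSE_CUDNN (illustrative only).
    #ifdef USE_CUDNN
    #include <cudnn.h>  // resolvable because the build links -lcudnn
    #endif
    #include <cstdio>

    int main() {
    #ifdef USE_CUDNN
      cudnnHandle_t handle;
      if (cudnnCreate(&handle) == CUDNN_STATUS_SUCCESS) {
        std::printf("built with cuDNN\n");
        cudnnDestroy(handle);
      }
    #else
      std::printf("built without cuDNN; standard Caffe kernels are used\n");
    #endif
      return 0;
    }
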
3 changes: 3 additions & 0 deletions Makefile.config.example
@@ -1,6 +1,9 @@
## Refer to http://caffe.berkeleyvision.org/installation.html
# Contributions simplifying and improving our build system are welcome!

# cuDNN acceleration switch (uncomment to build with cuDNN).
# USE_CUDNN := 1

# CPU-only switch (uncomment to build without GPU support).
# CPU_ONLY := 1

18 changes: 12 additions & 6 deletions docs/installation.md
@@ -15,7 +15,7 @@ We have installed Caffe on Ubuntu 14.04, Ubuntu 12.04, OS X 10.9, and OS X 10.8.

Caffe depends on several software packages.

* [CUDA](https://developer.nvidia.com/cuda-zone) library version 6.0, 5.5, or 5.0 and the latest driver version for CUDA 6 or 319.* for CUDA 5 (and NOT 331.*)
* [CUDA](https://developer.nvidia.com/cuda-zone) library version 6.5 (recommended), 6.0, 5.5, or 5.0 and the latest driver version for CUDA 6 or 319.* for CUDA 5 (and NOT 331.*)
* [BLAS](http://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) (provided via ATLAS, MKL, or OpenBLAS).
* [OpenCV](http://opencv.org/).
* [Boost](http://www.boost.org/) (>= 1.55, although only 1.55 is tested)
@@ -25,13 +25,17 @@ Caffe depends on several software packages.
* For the MATLAB wrapper
* MATLAB with the `mex` compiler.

**CPU-only Caffe**: for cold-brewed CPU-only Caffe uncomment the `CPU_ONLY := 1` in `Makefile.config` to configure and build Caffe without CUDA. This is helpful for cloud or cluster deployment.
**cuDNN Caffe**: for fastest operation, Caffe is accelerated by drop-in integration of [NVIDIA cuDNN](https://developer.nvidia.com/cudnn). To speed up your Caffe models, install cuDNN, then uncomment the `USE_CUDNN := 1` flag in `Makefile.config` when installing Caffe. Acceleration is automatic.

**CPU-only Caffe**: for cold-brewed CPU-only Caffe, uncomment the `CPU_ONLY := 1` flag in `Makefile.config` to configure and build Caffe without CUDA. This is helpful for cloud or cluster deployment.

### CUDA and BLAS

Caffe requires the CUDA `nvcc` compiler to compile its GPU code and the CUDA driver for GPU operation.
To install CUDA, go to the [NVIDIA CUDA website](https://developer.nvidia.com/cuda-downloads) and follow installation instructions there. Install the library and the latest standalone driver separately; the driver bundled with the library is usually out-of-date. **Warning!** The 331.* CUDA driver series has a critical performance issue: do not use it.

For best performance, Caffe can be accelerated by [NVIDIA cuDNN](https://developer.nvidia.com/cudnn). Register for free at the cuDNN site, install it, then continue with these installation instructions. To compile with cuDNN, set the `USE_CUDNN := 1` flag in your `Makefile.config`.

Caffe requires BLAS as the backend of its matrix and vector computations.
There are several implementations of this library.
The choice is yours:
@@ -92,7 +96,7 @@ Keep reading to find out how to manually build and install the Google flags library
On **CentOS / RHEL / Fedora**, most of the dependencies can be installed with

sudo yum install protobuf-devel leveldb-devel snappy-devel opencv-devel boost-devel hdf5-devel

The Google flags library, Google logging library, and LMDB have already made their way into newer versions of **CentOS / RHEL / Fedora**, so it is better to first attempt to install them using `yum`:

sudo yum install gflags-devel glog-devel lmdb-devel
@@ -192,7 +196,7 @@ If you're not using Anaconda, include `hdf5` in the list above.
**Note** that in order to build the Caffe Python wrappers you must install Boost using the `--with-python` option:

brew install --build-from-source --with-python --fresh -vd boost

**Note** that Homebrew maintains itself as a separate git repository, and making the above `brew edit FORMULA` changes will change files in your local copy of Homebrew's master branch. By default, this will prevent you from updating Homebrew using `brew update`, as you will get an error message like the following:

$ brew update
@@ -201,7 +205,7 @@ If you're not using Anaconda, include `hdf5` in the list above.
Please, commit your changes or stash them before you can merge.
Aborting
Error: Failure while executing: git pull -q origin refs/heads/master:refs/remotes/origin/master

One solution is to commit your changes to a separate Homebrew branch, run `brew update`, and rebase your changes onto the updated master, as follows:

cd /usr/local
@@ -213,7 +217,7 @@ One solution is to commit your changes to a separate Homebrew branch, run `brew update`, and rebase your changes onto the updated master, as follows:
git rebase master caffe
# Resolve any merge conflicts here
git checkout caffe

At this point, you should be running the latest Homebrew packages and your Caffe-related modifications will remain in place. You may still get the following error:

$ brew update
@@ -240,6 +244,8 @@ The defaults should work, but uncomment the relevant lines if using Anaconda Python.
make test
make runtest

To compile with cuDNN acceleration, you should uncomment the `USE_CUDNN := 1` switch in `Makefile.config`.

If there is no GPU in your machine, you should switch to CPU-only Caffe by uncommenting `CPU_ONLY := 1` in `Makefile.config`.

To compile the Python and MATLAB wrappers do `make pycaffe` and `make matcaffe` respectively.
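
The "acceleration is automatic" behavior described in the installation doc comes down to compile-time dispatch: when built with `USE_CUDNN`, layer construction can return the `CuDNN*` subclasses declared later in this commit. A hedged sketch with a hypothetical factory name (the commit's actual dispatch code is not part of this excerpt):

    // Hypothetical sketch of automatic cuDNN dispatch; MakeReLULayer is an
    // illustrative name, not Caffe's actual factory function.
    #include "caffe/neuron_layers.hpp"   // ReLULayer, CuDNNReLULayer (declared below)
    #include "caffe/proto/caffe.pb.h"    // LayerParameter

    namespace caffe {

    template <typename Dtype>
    ReLULayer<Dtype>* MakeReLULayer(const LayerParameter& param) {
    #ifdef USE_CUDNN
      return new CuDNNReLULayer<Dtype>(param);  // drop-in accelerated subclass
    #else
      return new ReLULayer<Dtype>(param);       // standard implementation
    #endif
    }

    }  // namespace caffe
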
20 changes: 16 additions & 4 deletions docs/performance_hardware.md
@@ -4,7 +4,7 @@ title: Performance and Hardware Configuration

# Performance and Hardware Configuration

To measure performance on different NVIDIA GPUs we use the Caffe reference ImageNet model.
To measure performance on different NVIDIA GPUs we use CaffeNet, the Caffe reference ImageNet model.

For training, each time point is 20 iterations/minibatches of 256 images for 5,120 images total. For testing, a 50,000 image validation set is classified.

@@ -14,11 +14,16 @@ For training, each time point is 20 iterations/minibatches of 256 images for 5,120 images total.

Performance is best with ECC off and boost clock enabled. While ECC makes a negligible difference in speed, disabling it frees ~1 GB of GPU memory.

Best settings with ECC off and maximum clock speed:
Best settings with ECC off and maximum clock speed in standard Caffe:

* Training is 26.5 secs / 20 iterations (5,120 images)
* Testing is 100 secs / validation set (50,000 images)

Best settings with Caffe + [cuDNN acceleration](http://nvidia.com/cudnn):

* Training is 19.2 secs / 20 iterations (5,120 images)
* Testing is 60.7 secs / validation set (50,000 images)

Other settings:

* ECC on, max speed: training 26.7 secs / 20 iterations, test 101 secs / validation set
@@ -50,12 +55,19 @@ but note that this configuration resets across driver reloading / rebooting. Inc
Training: 26.26 secs / 20 iterations (5,120 images).
Testing: 100 secs / validation set (50,000 images).

cuDNN Training: 20.25 secs / 20 iterations (5,120 images).
cuDNN Testing: 66.3 secs / validation set (50,000 images).


## NVIDIA K20

Training: 36.0 secs / 20 iterations (5,120 images).
Testing: 133 secs / validation set (50,000 images)
Testing: 133 secs / validation set (50,000 images).

## NVIDIA GTX 770

Training: 33.0 secs / 20 iterations (5,120 images).
Testing: 129 secs / validation set (50,000 images)
Testing: 129 secs / validation set (50,000 images).

cuDNN Training: 24.3 secs / 20 iterations (5,120 images).
cuDNN Testing: 104 secs / validation set (50,000 images).
26 changes: 26 additions & 0 deletions include/caffe/common_layers.hpp
@@ -375,6 +375,32 @@ class SoftmaxLayer : public Layer<Dtype> {
Blob<Dtype> scale_;
};

#ifdef USE_CUDNN
/**
* @brief cuDNN implementation of SoftmaxLayer.
* Fallback to SoftmaxLayer for CPU mode.
*/
template <typename Dtype>
class CuDNNSoftmaxLayer : public SoftmaxLayer<Dtype> {
public:
explicit CuDNNSoftmaxLayer(const LayerParameter& param)
: SoftmaxLayer<Dtype>(param) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
vector<Blob<Dtype>*>* top);
virtual ~CuDNNSoftmaxLayer();

protected:
virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
vector<Blob<Dtype>*>* top);
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);

cudnnHandle_t handle_;
cudnnTensor4dDescriptor_t bottom_desc_;
cudnnTensor4dDescriptor_t top_desc_;
};
#endif

/**
* @brief Creates a "split" path in the network by copying the bottom Blob
* into multiple top Blob%s to be used by multiple consuming layers.
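
The header above declares only the interface; the implementation files are not shown in this excerpt. As a sketch of how the declared `handle_` and tensor descriptors are typically used with the cuDNN R1 API (float data assumed; the function bodies are illustrative, not the commit's actual code):

    #include <cudnn.h>

    // Sketch: create a handle and describe a blob as an N x C x H x W tensor,
    // mirroring what CuDNNSoftmaxLayer::LayerSetUp plausibly does.
    void setup_sketch(cudnnHandle_t* handle, cudnnTensor4dDescriptor_t* desc,
                      int n, int c, int h, int w) {
      cudnnCreate(handle);                     // one handle per layer instance
      cudnnCreateTensor4dDescriptor(desc);
      cudnnSetTensor4dDescriptor(*desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
                                 n, c, h, w);
    }

    // Sketch: forward pass; CUDNN_SOFTMAX_MODE_CHANNEL matches Caffe's
    // softmax over the channel dimension.
    void softmax_forward_sketch(cudnnHandle_t handle,
                                cudnnTensor4dDescriptor_t bottom_desc,
                                const float* bottom_data,
                                cudnnTensor4dDescriptor_t top_desc,
                                float* top_data) {
      cudnnSoftmaxForward(handle, CUDNN_SOFTMAX_ACCURATE,
                          CUDNN_SOFTMAX_MODE_CHANNEL,
                          bottom_desc, bottom_data, top_desc, top_data);
    }

The destructor would reverse the setup with `cudnnDestroyTensor4dDescriptor` and `cudnnDestroy`.
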
75 changes: 75 additions & 0 deletions include/caffe/neuron_layers.hpp
@@ -356,6 +356,31 @@ class ReLULayer : public NeuronLayer<Dtype> {
const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);
};

#ifdef USE_CUDNN
/**
* @brief CuDNN acceleration of ReLULayer.
*/
template <typename Dtype>
class CuDNNReLULayer : public ReLULayer<Dtype> {
public:
explicit CuDNNReLULayer(const LayerParameter& param)
: ReLULayer<Dtype>(param) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
vector<Blob<Dtype>*>* top);
virtual ~CuDNNReLULayer();

protected:
virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
vector<Blob<Dtype>*>* top);
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);

cudnnHandle_t handle_;
cudnnTensor4dDescriptor_t bottom_desc_;
cudnnTensor4dDescriptor_t top_desc_;
};
#endif

/**
* @brief Sigmoid function non-linearity @f$
* y = (1 + \exp(-x))^{-1}
@@ -413,6 +438,31 @@ class SigmoidLayer : public NeuronLayer<Dtype> {
const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);
};

#ifdef USE_CUDNN
/**
* @brief CuDNN acceleration of SigmoidLayer.
*/
template <typename Dtype>
class CuDNNSigmoidLayer : public SigmoidLayer<Dtype> {
public:
explicit CuDNNSigmoidLayer(const LayerParameter& param)
: SigmoidLayer<Dtype>(param) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
vector<Blob<Dtype>*>* top);
virtual ~CuDNNSigmoidLayer();

protected:
virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
vector<Blob<Dtype>*>* top);
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);

cudnnHandle_t handle_;
cudnnTensor4dDescriptor_t bottom_desc_;
cudnnTensor4dDescriptor_t top_desc_;
};
#endif

/**
* @brief TanH hyperbolic tangent non-linearity @f$
* y = \frac{\exp(2x) - 1}{\exp(2x) + 1}
@@ -472,6 +522,31 @@ class TanHLayer : public NeuronLayer<Dtype> {
const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);
};

#ifdef USE_CUDNN
/**
* @brief CuDNN acceleration of TanHLayer.
*/
template <typename Dtype>
class CuDNNTanHLayer : public TanHLayer<Dtype> {
public:
explicit CuDNNTanHLayer(const LayerParameter& param)
: TanHLayer<Dtype>(param) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
vector<Blob<Dtype>*>* top);
virtual ~CuDNNTanHLayer();

protected:
virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
vector<Blob<Dtype>*>* top);
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);

cudnnHandle_t handle_;
cudnnTensor4dDescriptor_t bottom_desc_;
cudnnTensor4dDescriptor_t top_desc_;
};
#endif

/**
* @brief Tests whether the input exceeds a threshold: outputs 1 for inputs
* above threshold; 0 otherwise.
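
The three `CuDNN*` activation classes above are identical apart from the activation mode, which suggests a single shared call pattern. A hedged sketch of that pattern against the cuDNN R1 API (the commit's .cu files are not part of this excerpt, so treat the bodies as assumptions):

    #include <cudnn.h>

    // Sketch: one forward routine serves ReLU, Sigmoid, and TanH; only the
    // mode differs (CUDNN_ACTIVATION_RELU / _SIGMOID / _TANH).
    void activation_forward_sketch(cudnnHandle_t handle, cudnnActivationMode_t mode,
                                   cudnnTensor4dDescriptor_t bottom_desc,
                                   const float* bottom_data,
                                   cudnnTensor4dDescriptor_t top_desc,
                                   float* top_data) {
      cudnnActivationForward(handle, mode,
                             bottom_desc, bottom_data, top_desc, top_data);
    }

    // Sketch: the backward pass takes the outputs (y), their gradient (dy),
    // and the inputs (x), and writes the input gradient (dx).
    void activation_backward_sketch(cudnnHandle_t handle, cudnnActivationMode_t mode,
                                    cudnnTensor4dDescriptor_t top_desc,
                                    const float* top_data, const float* top_diff,
                                    cudnnTensor4dDescriptor_t bottom_desc,
                                    const float* bottom_data, float* bottom_diff) {
      cudnnActivationBackward(handle, mode,
                              top_desc, top_data, top_desc, top_diff,
                              bottom_desc, bottom_data, bottom_desc, bottom_diff);
    }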