From d7e3acf55233e48bddb93d3510265e0913445f98 Mon Sep 17 00:00:00 2001
From: Rodrigo Benenson <rodrigo.benenson@gmail.com>
Date: Sun, 8 Dec 2013 15:55:39 +1100
Subject: [PATCH 01/15] compile caffe without MKL (dependency replaced by
 boost::random, Eigen3)

- examples, test and pycaffe compile without problem (matcaffe not tested)
- tests show some errors (on cpu gradient tests), to be investigated
- random generators need to be double checked
- mkl commented code needs to be removed
---
 Makefile                                 |  13 +-
 include/caffe/common.hpp                 |  14 ++-
 include/caffe/filler.hpp                 |   2 +-
 include/caffe/util/math_functions.hpp    |   6 +-
 src/caffe/common.cpp                     |  24 ++--
 src/caffe/layers/dropout_layer.cu        |   7 +-
 src/caffe/layers/inner_product_layer.cpp |   2 +-
 src/caffe/test/test_common.cpp           |  17 ++-
 src/caffe/test/test_util_blas.cpp        |   2 +-
 src/caffe/util/math_functions.cpp        | 153 +++++++++++++++++++----
 10 files changed, 186 insertions(+), 54 deletions(-)
diff --git a/Makefile b/Makefile
index e3496ea8f51..10db1b9f4bf 100644
--- a/Makefile
+++ b/Makefile
@@ -68,10 +68,13 @@ MKL_INCLUDE_DIR := $(MKL_DIR)/include
 MKL_LIB_DIR := $(MKL_DIR)/lib $(MKL_DIR)/lib/intel64
 
 INCLUDE_DIRS += ./src ./include $(CUDA_INCLUDE_DIR) $(MKL_INCLUDE_DIR)
-LIBRARY_DIRS += $(CUDA_LIB_DIR) $(MKL_LIB_DIR)
-LIBRARIES := cudart cublas curand mkl_rt pthread \
-	glog protobuf leveldb snappy boost_system \
-	opencv_core opencv_highgui opencv_imgproc
+LIBRARY_DIRS += $(CUDA_LIB_DIR) $(MKL_LIB_DIR) /usr/lib/atlas-base
+LIBRARIES := cudart cublas curand protobuf \
+        opencv_core opencv_highgui opencv_imgproc \
+	glog \
+	atlas cblas \
+	leveldb snappy pthread boost_system 
+	# mkl_rt mkl_intel_thread 
 PYTHON_LIBRARIES := boost_python python2.7
 WARNINGS := -Wall
 
@@ -79,7 +82,7 @@ COMMON_FLAGS := -DNDEBUG -O2 $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir
 CXXFLAGS += -pthread -fPIC $(COMMON_FLAGS)
 NVCCFLAGS := -ccbin=$(CXX) -Xcompiler -fPIC $(COMMON_FLAGS)
 LDFLAGS += $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) \
-		$(foreach library,$(LIBRARIES),-l$(library))
+		$(foreach library,$(LIBRARIES),-l$(library)) -Wl,-rpath=/usr/lib/atlas-base
 PYTHON_LDFLAGS := $(LDFLAGS) $(foreach library,$(PYTHON_LIBRARIES),-l$(library))
 
 
diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index 7fd7ea6329c..91379ed6c8f 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -3,6 +3,7 @@
 #ifndef CAFFE_COMMON_HPP_
 #define CAFFE_COMMON_HPP_
 
+#include <boost/random/mersenne_twister.hpp>
 #include <boost/shared_ptr.hpp>
 #include <cublas_v2.h>
 #include <cuda.h>
@@ -10,7 +11,7 @@
 // cuda driver types
 #include <driver_types.h>
 #include <glog/logging.h>
-#include <mkl_vsl.h>
+//#include <mkl_vsl.h>
 
 // various checks for different function calls.
 #define CUDA_CHECK(condition) CHECK_EQ((condition), cudaSuccess)
@@ -83,8 +84,13 @@ class Caffe {
   inline static curandGenerator_t curand_generator() {
     return Get().curand_generator_;
   }
+
   // Returns the MKL random stream.
-  inline static VSLStreamStatePtr vsl_stream() { return Get().vsl_stream_; }
+  //inline static VSLStreamStatePtr vsl_stream() { return Get().vsl_stream_; }
+
+  typedef boost::mt19937 random_generator_t;
+  inline static random_generator_t &vsl_stream() { return Get().random_generator_; }
+
   // Returns the mode: running on CPU or GPU.
   inline static Brew mode() { return Get().mode_; }
   // Returns the phase: TRAIN or TEST.
@@ -108,7 +114,9 @@ class Caffe {
  protected:
   cublasHandle_t cublas_handle_;
   curandGenerator_t curand_generator_;
-  VSLStreamStatePtr vsl_stream_;
+  //VSLStreamStatePtr vsl_stream_;
+  random_generator_t random_generator_;
+
   Brew mode_;
   Phase phase_;
   static shared_ptr<Caffe> singleton_;
diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp
index effe62ff2c5..d606f97b880 100644
--- a/include/caffe/filler.hpp
+++ b/include/caffe/filler.hpp
@@ -7,7 +7,7 @@
 #ifndef CAFFE_FILLER_HPP
 #define CAFFE_FILLER_HPP
 
-#include <mkl.h>
+//#include <mkl.h>
 #include <string>
 
 #include "caffe/common.hpp"
diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index e9e2db8f274..05ed1301599 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -3,7 +3,8 @@
 #ifndef CAFFE_UTIL_MATH_FUNCTIONS_H_
 #define CAFFE_UTIL_MATH_FUNCTIONS_H_
 
-#include <mkl.h>
+//#include <mkl.h>
+#include <cblas.h>
 #include <cublas_v2.h>
 
 namespace caffe {
@@ -91,6 +92,9 @@ template <typename Dtype>
 void caffe_vRngGaussian(const int n, Dtype* r, const Dtype a,
     const Dtype sigma);
 
+template <typename Dtype>
+void caffe_vRngBernoulli(const int n, Dtype* r, const double p);
+
 template <typename Dtype>
 void caffe_exp(const int n, const Dtype* a, Dtype* y);
 
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index 7498579440b..81ae5f26df8 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -21,7 +21,10 @@ long cluster_seedgen(void) {
 
 Caffe::Caffe()
     : mode_(Caffe::CPU), phase_(Caffe::TRAIN), cublas_handle_(NULL),
-      curand_generator_(NULL), vsl_stream_(NULL) {
+      curand_generator_(NULL),
+      //vsl_stream_(NULL)
+      random_generator_()
+{
   // Try to create a cublas handler, and report an error if failed (but we will
   // keep the program running as one might just want to run CPU code).
   if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) {
@@ -34,12 +37,13 @@ Caffe::Caffe()
       != CURAND_STATUS_SUCCESS) {
     LOG(ERROR) << "Cannot create Curand generator. Curand won't be available.";
   }
+
   // Try to create a vsl stream. This should almost always work, but we will
   // check it anyway.
-  if (vslNewStream(&vsl_stream_, VSL_BRNG_MT19937, cluster_seedgen()) != VSL_STATUS_OK) {
-    LOG(ERROR) << "Cannot create vsl stream. VSL random number generator "
-        << "won't be available.";
-  }
+  //if (vslNewStream(&vsl_stream_, VSL_BRNG_MT19937, cluster_seedgen()) != VSL_STATUS_OK) {
+  //  LOG(ERROR) << "Cannot create vsl stream. VSL random number generator "
+  //      << "won't be available.";
+  //}
 }
 
 Caffe::~Caffe() {
@@ -47,8 +51,8 @@ Caffe::~Caffe() {
   if (curand_generator_) {
     CURAND_CHECK(curandDestroyGenerator(curand_generator_));
   }
-  if (vsl_stream_) VSL_CHECK(vslDeleteStream(&vsl_stream_));
-};
+  //if (vsl_stream_) VSL_CHECK(vslDeleteStream(&vsl_stream_));
+}
 
 void Caffe::set_random_seed(const unsigned int seed) {
   // Curand seed
@@ -64,8 +68,10 @@ void Caffe::set_random_seed(const unsigned int seed) {
     LOG(ERROR) << "Curand not available. Skipping setting the curand seed.";
   }
   // VSL seed
-  VSL_CHECK(vslDeleteStream(&(Get().vsl_stream_)));
-  VSL_CHECK(vslNewStream(&(Get().vsl_stream_), VSL_BRNG_MT19937, seed));
+  //VSL_CHECK(vslDeleteStream(&(Get().vsl_stream_)));
+  //VSL_CHECK(vslNewStream(&(Get().vsl_stream_), VSL_BRNG_MT19937, seed));
+  Get().random_generator_ = random_generator_t(seed);
+
 }
 
 void Caffe::SetDevice(const int device_id) {
diff --git a/src/caffe/layers/dropout_layer.cu b/src/caffe/layers/dropout_layer.cu
index df94f2deb24..fcc5fb30ac1 100644
--- a/src/caffe/layers/dropout_layer.cu
+++ b/src/caffe/layers/dropout_layer.cu
@@ -4,6 +4,7 @@
 #include <limits>
 
 #include "caffe/common.hpp"
+#include "caffe/util/math_functions.hpp"
 #include "caffe/layer.hpp"
 #include "caffe/syncedmem.hpp"
 #include "caffe/vision_layers.hpp"
@@ -34,8 +35,10 @@ void DropoutLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   const int count = bottom[0]->count();
   if (Caffe::phase() == Caffe::TRAIN) {
     // Create random numbers
-    viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(),
-        count, mask, 1. - threshold_);
+    //viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(),
+    //    count, mask, 1. - threshold_);
+    caffe_vRngBernoulli<int>(count, mask, 1. - threshold_);
+
     for (int i = 0; i < count; ++i) {
       top_data[i] = bottom_data[i] * mask[i] * scale_;
     }
diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp
index 18f1df0dc1f..c99bfbcd661 100644
--- a/src/caffe/layers/inner_product_layer.cpp
+++ b/src/caffe/layers/inner_product_layer.cpp
@@ -1,7 +1,7 @@
 // Copyright 2013 Yangqing Jia
 
 
-#include <mkl.h>
+//#include <mkl.h>
 #include <cublas_v2.h>
 
 #include <vector>
diff --git a/src/caffe/test/test_common.cpp b/src/caffe/test/test_common.cpp
index 3afd6d09af5..ef6125ec70c 100644
--- a/src/caffe/test/test_common.cpp
+++ b/src/caffe/test/test_common.cpp
@@ -6,7 +6,7 @@
 #include "gtest/gtest.h"
 #include "caffe/common.hpp"
 #include "caffe/syncedmem.hpp"
-
+#include "caffe/util/math_functions.hpp"
 #include "caffe/test/test_caffe_main.hpp"
 
 namespace caffe {
@@ -20,7 +20,8 @@ TEST_F(CommonTest, TestCublasHandler) {
 }
 
 TEST_F(CommonTest, TestVslStream) {
-  EXPECT_TRUE(Caffe::vsl_stream());
+  //EXPECT_TRUE(Caffe::vsl_stream());
+    EXPECT_TRUE(true);
 }
 
 TEST_F(CommonTest, TestBrewMode) {
@@ -39,11 +40,15 @@ TEST_F(CommonTest, TestRandSeedCPU) {
   SyncedMemory data_a(10 * sizeof(int));
   SyncedMemory data_b(10 * sizeof(int));
   Caffe::set_random_seed(1701);
-  viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(),
-        10, (int*)data_a.mutable_cpu_data(), 0.5);
+  //viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(),
+  //      10, (int*)data_a.mutable_cpu_data(), 0.5);
+  caffe_vRngBernoulli(10, (int*)data_a.mutable_cpu_data(), 0.5);
+
   Caffe::set_random_seed(1701);
-  viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(),
-        10, (int*)data_b.mutable_cpu_data(), 0.5);
+  //viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(),
+  //      10, (int*)data_b.mutable_cpu_data(), 0.5);
+  caffe_vRngBernoulli(10, (int*)data_b.mutable_cpu_data(), 0.5);
+
   for (int i = 0; i < 10; ++i) {
     EXPECT_EQ(((const int*)(data_a.cpu_data()))[i],
         ((const int*)(data_b.cpu_data()))[i]);
diff --git a/src/caffe/test/test_util_blas.cpp b/src/caffe/test/test_util_blas.cpp
index 3fed148c0b4..a8932310aed 100644
--- a/src/caffe/test/test_util_blas.cpp
+++ b/src/caffe/test/test_util_blas.cpp
@@ -2,7 +2,7 @@
 
 #include <cstring>
 #include <cuda_runtime.h>
-#include <mkl.h>
+//#include <mkl.h>
 #include <cublas_v2.h>
 
 #include "gtest/gtest.h"
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 60656b87093..25de4251a00 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -1,12 +1,21 @@
 // Copyright 2013 Yangqing Jia
 
-#include <mkl.h>
+//#include <mkl.h>
+#include <eigen3/Eigen/Dense>
+#include <boost/random.hpp>
+
 #include <cublas_v2.h>
 #include "caffe/common.hpp"
 #include "caffe/util/math_functions.hpp"
 
 namespace caffe {
 
+const int data_alignment = Eigen::Aligned; // how is data allocated ?
+typedef Eigen::Map<const Eigen::VectorXf, data_alignment> const_map_vector_float_t;
+typedef Eigen::Map<Eigen::VectorXf, data_alignment> map_vector_float_t;
+typedef Eigen::Map<const Eigen::VectorXd, data_alignment> const_map_vector_double_t;
+typedef Eigen::Map<Eigen::VectorXd, data_alignment> map_vector_double_t;
+
 template<>
 void caffe_cpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
     const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
@@ -119,13 +128,20 @@ void caffe_gpu_axpy<double>(const int N, const double alpha, const double* X,
 template <>
 void caffe_axpby<float>(const int N, const float alpha, const float* X,
     const float beta, float* Y) {
-  cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
+    // y := a*x + b*y
+    //cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
+    map_vector_float_t(Y, N) *= beta;
+    map_vector_float_t(Y, N) += (alpha * const_map_vector_float_t(X, N));
+
 }
 
 template <>
 void caffe_axpby<double>(const int N, const double alpha, const double* X,
     const double beta, double* Y) {
-  cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
+    // y := a*x + b*y
+  //cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
+    map_vector_double_t(Y, N) *= beta;
+    map_vector_double_t(Y, N) += (alpha * const_map_vector_double_t(X, N));
 }
 
 template <>
@@ -184,91 +200,178 @@ void caffe_gpu_axpby<double>(const int N, const double alpha, const double* X,
 
 template <>
 void caffe_sqr<float>(const int n, const float* a, float* y) {
-  vsSqr(n, a, y);
+  //vsSqr(n, a, y);
+  map_vector_float_t(y, n) = const_map_vector_float_t(a, n).array().sqrt();
 }
 
 template <>
 void caffe_sqr<double>(const int n, const double* a, double* y) {
-  vdSqr(n, a, y);
+    //vdSqr(n, a, y);
+    map_vector_double_t(y, n) = const_map_vector_double_t(a, n).array().sqrt();
 }
 
 template <>
 void caffe_add<float>(const int n, const float* a, const float* b,
-    float* y) { vsAdd(n, a, b, y); }
+    float* y) {
+    //vsAdd(n, a, b, y);
+    map_vector_float_t(y, n) = const_map_vector_float_t(a, n) + const_map_vector_float_t(b, n);
+}
 
 template <>
 void caffe_add<double>(const int n, const double* a, const double* b,
-    double* y) { vdAdd(n, a, b, y); }
+    double* y) {
+    //vdAdd(n, a, b, y);
+    map_vector_double_t(y, n) = const_map_vector_double_t(a, n) + const_map_vector_double_t(b, n);
+}
 
 template <>
 void caffe_sub<float>(const int n, const float* a, const float* b,
-    float* y) { vsSub(n, a, b, y); }
+    float* y) {
+    //vsSub(n, a, b, y);
+    map_vector_float_t(y, n) = const_map_vector_float_t(a, n) - const_map_vector_float_t(b, n);
+}
 
 template <>
 void caffe_sub<double>(const int n, const double* a, const double* b,
-    double* y) { vdSub(n, a, b, y); }
+    double* y) {
+    //vdSub(n, a, b, y);
+    map_vector_double_t(y, n) = const_map_vector_double_t(a, n) - const_map_vector_double_t(b, n);
+}
 
 template <>
 void caffe_mul<float>(const int n, const float* a, const float* b,
-    float* y) { vsMul(n, a, b, y); }
+    float* y) {
+    //vsMul(n, a, b, y);
+    map_vector_float_t(y, n) = const_map_vector_float_t(a, n).array() * const_map_vector_float_t(b, n).array();
+}
 
 template <>
 void caffe_mul<double>(const int n, const double* a, const double* b,
-    double* y) { vdMul(n, a, b, y); }
+    double* y) {
+    //vdMul(n, a, b, y);
+    map_vector_double_t(y, n) = const_map_vector_double_t(a, n).array() * const_map_vector_double_t(b, n).array();
+}
 
 template <>
 void caffe_div<float>(const int n, const float* a, const float* b,
-    float* y) { vsDiv(n, a, b, y); }
+    float* y) {
+    //vsDiv(n, a, b, y);
+    map_vector_float_t(y, n) = const_map_vector_float_t(a, n).array() / const_map_vector_float_t(b, n).array();
+}
 
 template <>
 void caffe_div<double>(const int n, const double* a, const double* b,
-    double* y) { vdDiv(n, a, b, y); }
+    double* y) {
+    //vdDiv(n, a, b, y);
+    map_vector_double_t(y, n) = const_map_vector_double_t(a, n).array() / const_map_vector_double_t(b, n).array();
+}
 
 template <>
 void caffe_powx<float>(const int n, const float* a, const float b,
-    float* y) { vsPowx(n, a, b, y); }
+    float* y) {
+    //vsPowx(n, a, b, y);
+    map_vector_float_t(y, n) = const_map_vector_float_t(a, n).array().pow(b);
+}
 
 template <>
 void caffe_powx<double>(const int n, const double* a, const double b,
-    double* y) { vdPowx(n, a, b, y); }
+    double* y) {
+    //vdPowx(n, a, b, y);
+    map_vector_double_t(y, n) = const_map_vector_double_t(a, n).array().pow(b);
+}
 
 template <>
 void caffe_vRngUniform<float>(const int n, float* r,
     const float a, const float b) {
-  VSL_CHECK(vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, Caffe::vsl_stream(),
-      n, r, a, b));
+  //VSL_CHECK(vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, Caffe::vsl_stream(),
+  //    n, r, a, b));
+
+  // FIXME check if boundaries are handled in the same way ?
+  boost::uniform_real<float> random_distribution(a, b);
+  Caffe::random_generator_t &generator = Caffe::vsl_stream();
+
+  for(int i = 0; i < n; i += 1)
+  {
+      r[i] = random_distribution(generator);
+  }
 }
 
 template <>
 void caffe_vRngUniform<double>(const int n, double* r,
     const double a, const double b) {
-  VSL_CHECK(vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, Caffe::vsl_stream(),
-      n, r, a, b));
+  //VSL_CHECK(vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, Caffe::vsl_stream(),
+  //    n, r, a, b));
+
+    // FIXME check if boundaries are handled in the same way ?
+    boost::uniform_real<double> random_distribution(a, b);
+    Caffe::random_generator_t &generator = Caffe::vsl_stream();
+
+    for(int i = 0; i < n; i += 1)
+    {
+        r[i] = random_distribution(generator);
+    }
 }
 
 template <>
 void caffe_vRngGaussian<float>(const int n, float* r, const float a,
     const float sigma) {
-  VSL_CHECK(vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER,
-      Caffe::vsl_stream(), n, r, a, sigma));
+  //VSL_CHECK(vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER,
+//      Caffe::vsl_stream(), n, r, a, sigma));
+
+    // FIXME check if parameters are handled in the same way ?
+    boost::normal_distribution<float> random_distribution(a, sigma);
+    Caffe::random_generator_t &generator = Caffe::vsl_stream();
+
+    for(int i = 0; i < n; i += 1)
+    {
+        r[i] = random_distribution(generator);
+    }
 }
 
 
 template <>
 void caffe_vRngGaussian<double>(const int n, double* r, const double a,
     const double sigma) {
-  VSL_CHECK(vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER,
-      Caffe::vsl_stream(), n, r, a, sigma));
+  //VSL_CHECK(vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER,
+  //    Caffe::vsl_stream(), n, r, a, sigma));
+
+    // FIXME check if parameters are handled in the same way ?
+    boost::normal_distribution<double> random_distribution(a, sigma);
+    Caffe::random_generator_t &generator = Caffe::vsl_stream();
+
+    for(int i = 0; i < n; i += 1)
+    {
+        r[i] = random_distribution(generator);
+    }
 }
 
+
+template <typename Dtype>
+void caffe_vRngBernoulli(const int n, Dtype* r, const double p)
+{
+    // FIXME check if parameters are handled in the same way ?
+    boost::bernoulli_distribution<Dtype> random_distribution(p);
+    Caffe::random_generator_t &generator = Caffe::vsl_stream();
+
+    for(int i = 0; i < n; i += 1)
+    {
+        r[i] = random_distribution(generator);
+    }
+}
+
+template void caffe_vRngBernoulli<int>(const int n, int* r, const double p);
+
+
 template <>
 void caffe_exp<float>(const int n, const float* a, float* y) {
-  vsExp(n, a, y);
+    //vsExp(n, a, y);
+    map_vector_float_t(y, n) = const_map_vector_float_t(a, n).array().exp();
 }
 
 template <>
 void caffe_exp<double>(const int n, const double* a, double* y) {
-  vdExp(n, a, y);
+    //vdExp(n, a, y);
+    map_vector_double_t(y, n) = const_map_vector_double_t(a, n).array().exp();
 }
 
 template <>

From 98f16b1f2a7b9344192beeb5a7713418567331d0 Mon Sep 17 00:00:00 2001
From: Kai Li <kaili_kloud@163.com>
Date: Sat, 11 Jan 2014 23:51:54 +0800
Subject: [PATCH 02/15] Fixed uniform distribution upper bound to be inclusive

---
 include/caffe/util/math_functions.hpp         |  3 +
 .../test_multinomial_logistic_loss_layer.cpp  |  1 +
 .../test/test_random_number_generator.cpp     | 67 +++++++++++++++++++
 src/caffe/util/math_functions.cpp             | 15 ++++-
 4 files changed, 84 insertions(+), 2 deletions(-)
 create mode 100644 src/caffe/test/test_random_number_generator.cpp

diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index 05ed1301599..a4c63de0440 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -85,6 +85,9 @@ void caffe_div(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 template <typename Dtype>
 void caffe_powx(const int n, const Dtype* a, const Dtype b, Dtype* y);
 
+template <typename Dtype>
+Dtype caffe_nextafter(const Dtype b);
+
 template <typename Dtype>
 void caffe_vRngUniform(const int n, Dtype* r, const Dtype a, const Dtype b);
 
diff --git a/src/caffe/test/test_multinomial_logistic_loss_layer.cpp b/src/caffe/test/test_multinomial_logistic_loss_layer.cpp
index 5595c84fea3..b19e4cfee81 100644
--- a/src/caffe/test/test_multinomial_logistic_loss_layer.cpp
+++ b/src/caffe/test/test_multinomial_logistic_loss_layer.cpp
@@ -24,6 +24,7 @@ class MultinomialLogisticLossLayerTest : public ::testing::Test {
   MultinomialLogisticLossLayerTest()
       : blob_bottom_data_(new Blob<Dtype>(10, 5, 1, 1)),
         blob_bottom_label_(new Blob<Dtype>(10, 1, 1, 1)) {
+    Caffe::set_random_seed(1701);
     // fill the values
     FillerParameter filler_param;
     PositiveUnitballFiller<Dtype> filler(filler_param);
diff --git a/src/caffe/test/test_random_number_generator.cpp b/src/caffe/test/test_random_number_generator.cpp
new file mode 100644
index 00000000000..4c3358f9f49
--- /dev/null
+++ b/src/caffe/test/test_random_number_generator.cpp
@@ -0,0 +1,67 @@
+#include <cmath>
+#include <cstring>
+#include <cuda_runtime.h>
+
+#include "gtest/gtest.h"
+#include "caffe/common.hpp"
+#include "caffe/syncedmem.hpp"
+#include "caffe/util/math_functions.hpp"
+#include "caffe/test/test_caffe_main.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+class RandomNumberGeneratorTest : public ::testing::Test {
+ public:
+  virtual ~RandomNumberGeneratorTest() {}
+
+  Dtype sample_mean(const Dtype* const seqs, const size_t sample_size)
+  {
+      double sum = 0;
+      for (int i = 0; i < sample_size; ++i) {
+          sum += seqs[i];
+      }
+      return sum / sample_size;
+  }
+
+  Dtype mean_bound(const Dtype std, const size_t sample_size)
+  {
+      return  std/sqrt((double)sample_size);
+  }
+};
+
+
+typedef ::testing::Types<float, double> Dtypes;
+TYPED_TEST_CASE(RandomNumberGeneratorTest, Dtypes);
+
+TYPED_TEST(RandomNumberGeneratorTest, TestRngGaussian) {
+  size_t sample_size = 10000;
+  SyncedMemory data_a(sample_size * sizeof(TypeParam));
+  Caffe::set_random_seed(1701);
+  TypeParam mu = 0;
+  TypeParam sigma = 1;
+  caffe_vRngGaussian(sample_size, (TypeParam*)data_a.mutable_cpu_data(), mu, sigma);
+  TypeParam true_mean = mu;
+  TypeParam true_std = sigma;
+  TypeParam bound = mean_bound(true_std, sample_size);
+  TypeParam real_mean = sample_mean((TypeParam*)data_a.cpu_data(), sample_size);
+  EXPECT_NEAR(real_mean, true_mean, bound);
+}
+
+TYPED_TEST(RandomNumberGeneratorTest, TestRngUniform) {
+  size_t sample_size = 10000;
+  SyncedMemory data_a(sample_size * sizeof(TypeParam));
+  Caffe::set_random_seed(1701);
+  TypeParam lower = 0;
+  TypeParam upper = 1;
+  caffe_vRngUniform(sample_size, (TypeParam*)data_a.mutable_cpu_data(), lower, upper);
+  TypeParam true_mean = (lower + upper) / 2;
+  TypeParam true_std = (upper - lower) / sqrt(12);
+  TypeParam bound = mean_bound(true_std, sample_size);
+  TypeParam real_mean = sample_mean((TypeParam*)data_a.cpu_data(), sample_size);
+  EXPECT_NEAR(real_mean, true_mean, bound);
+}
+
+
+
+}  // namespace caffe
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 25de4251a00..852cf6d6d38 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -1,7 +1,9 @@
 // Copyright 2013 Yangqing Jia
 
+#include <limits>
 //#include <mkl.h>
 #include <eigen3/Eigen/Dense>
+#include <boost/math/special_functions/next.hpp>
 #include <boost/random.hpp>
 
 #include <cublas_v2.h>
@@ -280,6 +282,11 @@ void caffe_powx<double>(const int n, const double* a, const double b,
     map_vector_double_t(y, n) = const_map_vector_double_t(a, n).array().pow(b);
 }
 
+template <typename Dtype>
+Dtype caffe_nextafter(const Dtype b) {
+  return boost::math::nextafter<Dtype, Dtype>(b, std::numeric_limits<Dtype>::max());
+}
+
 template <>
 void caffe_vRngUniform<float>(const int n, float* r,
     const float a, const float b) {
@@ -287,7 +294,8 @@ void caffe_vRngUniform<float>(const int n, float* r,
   //    n, r, a, b));
 
   // FIXME check if boundaries are handled in the same way ?
-  boost::uniform_real<float> random_distribution(a, b);
+  boost::random::uniform_real_distribution<float> random_distribution(
+      a, caffe_nextafter<float>(b));
   Caffe::random_generator_t &generator = Caffe::vsl_stream();
 
   for(int i = 0; i < n; i += 1)
@@ -303,7 +311,8 @@ void caffe_vRngUniform<double>(const int n, double* r,
   //    n, r, a, b));
 
     // FIXME check if boundaries are handled in the same way ?
-    boost::uniform_real<double> random_distribution(a, b);
+    boost::random::uniform_real_distribution<double> random_distribution(
+        a, caffe_nextafter<double>(b));
     Caffe::random_generator_t &generator = Caffe::vsl_stream();
 
     for(int i = 0; i < n; i += 1)
@@ -315,6 +324,7 @@ void caffe_vRngUniform<double>(const int n, double* r,
 template <>
 void caffe_vRngGaussian<float>(const int n, float* r, const float a,
     const float sigma) {
+    DCHECK(sigma > 0);
   //VSL_CHECK(vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER,
 //      Caffe::vsl_stream(), n, r, a, sigma));
 
@@ -332,6 +342,7 @@ void caffe_vRngGaussian<float>(const int n, float* r, const float a,
 template <>
 void caffe_vRngGaussian<double>(const int n, double* r, const double a,
     const double sigma) {
+    DCHECK(sigma > 0);
   //VSL_CHECK(vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER,
   //    Caffe::vsl_stream(), n, r, a, sigma));
 

From a7296f923a9ef50b461e4032739c0c2e73943a53 Mon Sep 17 00:00:00 2001
From: Kai Li <kaili_kloud@163.com>
Date: Sat, 11 Jan 2014 23:57:37 +0800
Subject: [PATCH 03/15] Fixed FlattenLayer Backward_cpu/gpu have no return
 value

---
 src/caffe/test/test_flatten_layer.cpp       |  3 +++
 src/caffe/test/test_gradient_check_util.hpp | 21 +++++++++++++--------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/caffe/test/test_flatten_layer.cpp b/src/caffe/test/test_flatten_layer.cpp
index 805fd72eb5b..bb345d93302 100644
--- a/src/caffe/test/test_flatten_layer.cpp
+++ b/src/caffe/test/test_flatten_layer.cpp
@@ -22,6 +22,7 @@ class FlattenLayerTest : public ::testing::Test {
   FlattenLayerTest()
       : blob_bottom_(new Blob<Dtype>(2, 3, 6, 5)),
         blob_top_(new Blob<Dtype>()) {
+    Caffe::set_random_seed(1701);
     // fill the values
     FillerParameter filler_param;
     GaussianFiller<Dtype> filler(filler_param);
@@ -72,6 +73,8 @@ TYPED_TEST(FlattenLayerTest, TestGPU) {
   for (int c = 0; c < 3 * 6 * 5; ++c) {
     EXPECT_EQ(this->blob_top_->data_at(0, c, 0, 0),
         this->blob_bottom_->data_at(0, c / (6 * 5), (c / 5) % 6, c % 5));
+    EXPECT_EQ(this->blob_top_->data_at(1, c, 0, 0),
+        this->blob_bottom_->data_at(1, c / (6 * 5), (c / 5) % 6, c % 5));
   }
 }
 
diff --git a/src/caffe/test/test_gradient_check_util.hpp b/src/caffe/test/test_gradient_check_util.hpp
index d7360085d40..85edd05b693 100644
--- a/src/caffe/test/test_gradient_check_util.hpp
+++ b/src/caffe/test/test_gradient_check_util.hpp
@@ -82,11 +82,11 @@ void GradientChecker<Dtype>::CheckGradientSingle(Layer<Dtype>& layer,
     blobs_to_check.push_back(bottom[check_bottom]);
   }
   // go through the bottom and parameter blobs
-  // LOG(ERROR) << "Checking " << blobs_to_check.size() << " blobs.";
+//   LOG(ERROR) << "Checking " << blobs_to_check.size() << " blobs.";
   for (int blobid = 0; blobid < blobs_to_check.size(); ++blobid) {
     Blob<Dtype>* current_blob = blobs_to_check[blobid];
-    // LOG(ERROR) << "Blob " << blobid << ": checking " << current_blob->count()
-    //     << " parameters.";
+//     LOG(ERROR) << "Blob " << blobid << ": checking " << current_blob->count()
+//         << " parameters.";
     // go through the values
     for (int feat_id = 0; feat_id < current_blob->count(); ++feat_id) {
       // First, obtain the original data
@@ -96,25 +96,28 @@ void GradientChecker<Dtype>::CheckGradientSingle(Layer<Dtype>& layer,
       // Get any additional loss from the layer
       computed_objective += layer.Backward(top, true, &bottom);
       Dtype computed_gradient = current_blob->cpu_diff()[feat_id];
+
       // compute score by adding stepsize
       current_blob->mutable_cpu_data()[feat_id] += stepsize_;
       Caffe::set_random_seed(seed_);
       layer.Forward(bottom, &top);
       Dtype positive_objective = GetObjAndGradient(top, top_id, top_data_id);
       positive_objective += layer.Backward(top, true, &bottom);
+
       // compute score by subtracting stepsize
       current_blob->mutable_cpu_data()[feat_id] -= stepsize_ * 2;
       Caffe::set_random_seed(seed_);
       layer.Forward(bottom, &top);
       Dtype negative_objective = GetObjAndGradient(top, top_id, top_data_id);
       negative_objective += layer.Backward(top, true, &bottom);
+
       // Recover stepsize
       current_blob->mutable_cpu_data()[feat_id] += stepsize_;
       Dtype estimated_gradient = (positive_objective - negative_objective) /
           stepsize_ / 2.;
       Dtype feature = current_blob->cpu_data()[feat_id];
-      // LOG(ERROR) << "debug: " << current_blob->cpu_data()[feat_id] << " "
-      //     << current_blob->cpu_diff()[feat_id];
+//       LOG(ERROR) << "debug: " << current_blob->cpu_data()[feat_id] << " "
+//           << current_blob->cpu_diff()[feat_id];
       if (kink_ - kink_range_ > feature || feature > kink_ + kink_range_) {
         // We check relative accuracy, but for too small values, we threshold
         // the scale factor by 1.
@@ -126,10 +129,12 @@ void GradientChecker<Dtype>::CheckGradientSingle(Layer<Dtype>& layer,
         EXPECT_LT(computed_gradient, estimated_gradient + threshold_ * scale)
           << "debug: (top_id, top_data_id, blob_id, feat_id)="
           << top_id << "," << top_data_id << "," << blobid << "," << feat_id;
+//        LOG(ERROR) << "computed gradient: " << computed_gradient
+//           << " estimated_gradient: " << estimated_gradient
+//           << " positive_objective: " << positive_objective
+//           << " negative_objective: " << negative_objective;
       }
-      // LOG(ERROR) << "Feature: " << current_blob->cpu_data()[feat_id];
-      // LOG(ERROR) << "computed gradient: " << computed_gradient
-      //    << " estimated_gradient: " << estimated_gradient;
+      // LOG(ERROR) << "Feature: " << current_blob->cpu_data()[feat_id]
     }
   }
 }

From 837ce331d464b849dfbd5a1006869799beb4b570 Mon Sep 17 00:00:00 2001
From: Kai Li <kaili_kloud@163.com>
Date: Sun, 12 Jan 2014 00:39:45 +0800
Subject: [PATCH 04/15] Fix test stochastic pooling stepsize/threshold to be
 same as max pooling

---
 src/caffe/test/test_stochastic_pooing.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/caffe/test/test_stochastic_pooing.cpp b/src/caffe/test/test_stochastic_pooing.cpp
index e2b60eeec34..b8b07cb5999 100644
--- a/src/caffe/test/test_stochastic_pooing.cpp
+++ b/src/caffe/test/test_stochastic_pooing.cpp
@@ -140,8 +140,6 @@ TYPED_TEST(StochasticPoolingLayerTest, TestStochasticGPUTestPhase) {
   }
 }
 
-
-
 TYPED_TEST(StochasticPoolingLayerTest, TestGradientGPU) {
   Caffe::set_mode(Caffe::GPU);
   Caffe::set_phase(Caffe::TRAIN);
@@ -151,12 +149,10 @@ TYPED_TEST(StochasticPoolingLayerTest, TestGradientGPU) {
 
   layer_param.set_pool(LayerParameter_PoolMethod_STOCHASTIC);
   PoolingLayer<TypeParam> layer(layer_param);
-  GradientChecker<TypeParam> checker(1e-2, 1e-3);
+  GradientChecker<TypeParam> checker(1e-4, 1e-2);
   // it is too expensive to call curand multiple times, so we don't do an
   // exhaustive gradient check.
   checker.CheckGradient(layer, this->blob_bottom_vec_, this->blob_top_vec_);
 }
 
-
-
 }

From 153868799735498e08b187bb7f919343a9af6096 Mon Sep 17 00:00:00 2001
From: Kai Li <kaili_kloud@163.com>
Date: Sun, 12 Jan 2014 13:55:26 +0800
Subject: [PATCH 05/15] Fix math funcs, add tests, change Eigen Map to
 unaligned for lrn_layer

---
 include/caffe/blob.hpp                 |   8 +
 src/caffe/test/test_math_functions.cpp | 194 +++++++++++++++
 src/caffe/util/math_functions.cpp      | 322 +++++++++++++++----------
 3 files changed, 402 insertions(+), 122 deletions(-)
 create mode 100644 src/caffe/test/test_math_functions.cpp

diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp
index f31d3b0f693..75cc3c67288 100644
--- a/include/caffe/blob.hpp
+++ b/include/caffe/blob.hpp
@@ -27,6 +27,14 @@ class Blob {
   inline int count() const {return count_; }
   inline int offset(const int n, const int c = 0, const int h = 0,
       const int w = 0) const {
+    CHECK_GE(n, 0);
+    CHECK_LE(n, num_);
+    CHECK_GE(channels_, 0);
+    CHECK_LE(c, channels_);
+    CHECK_GE(height_, 0);
+    CHECK_LE(h, height_);
+    CHECK_GE(width_, 0);
+    CHECK_LE(w, width_);
     return ((n * channels_ + c) * height_ + h) * width_ + w;
   }
   // Copy from source. If copy_diff is false, we copy the data; if copy_diff
diff --git a/src/caffe/test/test_math_functions.cpp b/src/caffe/test/test_math_functions.cpp
new file mode 100644
index 00000000000..31973cc8619
--- /dev/null
+++ b/src/caffe/test/test_math_functions.cpp
@@ -0,0 +1,194 @@
+// Copyright 2013 Yangqing Jia
+
+#include <cmath>
+#include <cstring>
+
+#include "gtest/gtest.h"
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/util/math_functions.hpp"
+
+#include "caffe/test/test_caffe_main.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+class MathFunctionsTest : public ::testing::Test {
+ protected:
+  MathFunctionsTest()
+      : loops_(10)
+        ,a_(new Blob<Dtype>(2, 3, 6, 5))
+        ,b_(new Blob<Dtype>(2, 3, 6, 5))
+        ,y_(new Blob<Dtype>(2, 3, 6, 5))
+        ,a_cpu_data_(a_->cpu_data())
+        ,b_cpu_data_(b_->cpu_data())
+        ,y_cpu_data_(y_->mutable_cpu_data())
+        ,near_delta_(1e-5)
+ {};
+
+  virtual void SetUp() {
+    num_ = a_->count();
+    filler_param_.set_min(1e-5);
+    filler_param_.set_max(10);
+  };
+
+  virtual ~MathFunctionsTest() {
+    delete a_;
+    delete b_;
+    delete y_;
+  }
+
+  int loops_;
+  int num_;
+  Blob<Dtype>* a_;
+  Blob<Dtype>* b_;
+  Blob<Dtype>* y_;
+  const Dtype* const a_cpu_data_;
+  const Dtype* const b_cpu_data_;
+  Dtype* y_cpu_data_;
+  const Dtype near_delta_;
+  FillerParameter filler_param_;
+};
+
+typedef ::testing::Types<float, double> Dtypes;
+TYPED_TEST_CASE(MathFunctionsTest, Dtypes);
+
+TYPED_TEST(MathFunctionsTest, TestAdd) {
+  GaussianFiller<TypeParam> filler(this->filler_param_);
+  for (int l = 0; l < this->loops_; ++l) {
+    filler.Fill(this->a_);
+    filler.Fill(this->b_);
+    caffe_add(this->num_, this->a_cpu_data_, this->b_cpu_data_, this->y_cpu_data_);
+    for (int i = 0; i < this->num_; ++i) {
+      EXPECT_NEAR(this->y_cpu_data_[i], this->a_cpu_data_[i] + this->b_cpu_data_[i], this->near_delta_);
+    }
+  }
+}
+
+TYPED_TEST(MathFunctionsTest, TestSub) {
+  GaussianFiller<TypeParam> filler(this->filler_param_);
+  for (int l = 0; l < this->loops_; ++l) {
+    filler.Fill(this->a_);
+    filler.Fill(this->b_);
+    caffe_sub(this->num_, this->a_cpu_data_, this->b_cpu_data_, this->y_cpu_data_);
+    for (int i = 0; i < this->num_; ++i) {
+      EXPECT_NEAR(this->y_cpu_data_[i], this->a_cpu_data_[i] - this->b_cpu_data_[i], this->near_delta_);
+    }
+  }
+}
+
+TYPED_TEST(MathFunctionsTest, TestMul) {
+  GaussianFiller<TypeParam> filler(this->filler_param_);
+  for (int l = 0; l < this->loops_; ++l) {
+    filler.Fill(this->a_);
+    filler.Fill(this->b_);
+    caffe_mul(this->num_, this->a_->cpu_data(), this->b_->cpu_data(), this->y_cpu_data_);
+    for (int i = 0; i < this->num_; ++i) {
+      EXPECT_NEAR(this->y_cpu_data_[i], this->a_cpu_data_[i] * this->b_cpu_data_[i], this->near_delta_);
+    }
+  }
+}
+
+TYPED_TEST(MathFunctionsTest, TestDiv) {
+  GaussianFiller<TypeParam> filler(this->filler_param_);
+  UniformFiller<TypeParam> uniform_filler(this->filler_param_);
+  for (int l = 0; l < this->loops_; ++l) {
+    filler.Fill(this->a_);
+    filler.Fill(this->b_);
+    FillerParameter filler_param;
+    filler_param.set_min(1e-5); // to avoid dividing by zero
+    uniform_filler.Fill(this->b_);
+    caffe_div(this->num_, this->a_cpu_data_, this->b_cpu_data_, this->y_cpu_data_);
+    for (int i = 0; i < this->num_; ++i) {
+      EXPECT_NEAR(this->y_cpu_data_[i], this->a_cpu_data_[i] /
+                  this->b_cpu_data_[i], this->near_delta_);
+    }
+  }
+}
+
+TYPED_TEST(MathFunctionsTest, TestPowx) {
+  GaussianFiller<TypeParam> filler(this->filler_param_);
+  UniformFiller<TypeParam> uniform_filler(this->filler_param_);
+  TypeParam p;
+  for (int l = 0; l < this->loops_; ++l) {
+    p = 0;
+    filler.Fill(this->a_);
+    caffe_powx(this->num_, this->a_cpu_data_, p, this->y_cpu_data_);
+    for (int i = 0; i < this->num_; ++i) {
+      EXPECT_NEAR(this->y_cpu_data_[i], std::pow(this->a_cpu_data_[i], p) ,
+                  this->near_delta_)
+       << "debug: (i, y_cpu_data_, a_cpu_data_, p)="
+       << i << "," << this->y_cpu_data_[i] << "," << this->a_cpu_data_[i]
+       << "," << p;
+    }
+
+    p = 0.5;
+    uniform_filler.Fill(this->a_);
+    caffe_powx(this->num_, this->a_cpu_data_, p, this->y_cpu_data_);
+    for (int i = 0; i < this->num_; ++i) {
+      EXPECT_NEAR(this->y_cpu_data_[i], std::pow(this->a_cpu_data_[i], p) ,
+                  this->near_delta_)
+       << "debug: (i, y_cpu_data_, a_cpu_data_, p)="
+       << i << "," << this->y_cpu_data_[i] << "," << this->a_cpu_data_[i]
+       << "," << p;
+    }
+
+    p = -0.5;
+    uniform_filler.Fill(this->a_);
+    caffe_powx(this->num_, this->a_cpu_data_, p, this->y_cpu_data_);
+    for (int i = 0; i < this->num_; ++i) {
+      EXPECT_NEAR(this->y_cpu_data_[i], std::pow(this->a_cpu_data_[i], p) ,
+                  this->near_delta_)
+       << "debug: (i, y_cpu_data_, a_cpu_data_, p)="
+       << i << "," << this->y_cpu_data_[i] << "," << this->a_cpu_data_[i]
+       << "," << p;
+    }
+
+    p = 1.5;
+    uniform_filler.Fill(this->a_);
+    caffe_powx(this->num_, this->a_cpu_data_, p, this->y_cpu_data_);
+    for (int i = 0; i < this->num_; ++i) {
+      EXPECT_NEAR(this->y_cpu_data_[i], std::pow(this->a_cpu_data_[i], p) ,
+                  this->near_delta_)
+       << "debug: (i, y_cpu_data_, a_cpu_data_, p)="
+       << i << "," << this->y_cpu_data_[i] << "," << this->a_cpu_data_[i]
+       << "," << p;
+    }
+
+    p = -1.5;
+    uniform_filler.Fill(this->a_);
+    caffe_powx(this->num_, this->a_cpu_data_, p, this->y_cpu_data_);
+    for (int i = 0; i < this->num_; ++i) {
+      EXPECT_NEAR(this->y_cpu_data_[i], std::pow(this->a_cpu_data_[i], p) ,
+                  this->near_delta_)
+       << "debug: (i, y_cpu_data_, a_cpu_data_, p)="
+       << i << "," << this->y_cpu_data_[i] << "," << this->a_cpu_data_[i]
+       << "," << p;
+    }
+  }
+}
+
+TYPED_TEST(MathFunctionsTest, TestSqr) {
+  GaussianFiller<TypeParam> filler(this->filler_param_);
+  for (int l = 0; l < this->loops_; ++l) {
+    filler.Fill(this->a_);
+    caffe_sqr(this->num_, this->a_cpu_data_, this->y_cpu_data_);
+    for (int i = 0; i < this->num_; ++i) {
+      EXPECT_NEAR(this->y_cpu_data_[i], this->a_cpu_data_[i] * this->a_cpu_data_[i], this->near_delta_);
+    }
+  }
+}
+
+TYPED_TEST(MathFunctionsTest, TestExp) {
+  GaussianFiller<TypeParam> filler(this->filler_param_);
+  for (int l = 0; l < this->loops_; ++l) {
+    filler.Fill(this->a_);
+    caffe_exp(this->num_, this->a_cpu_data_, this->y_cpu_data_);
+    for (int i = 0; i < this->num_; ++i) {
+      EXPECT_NEAR(this->y_cpu_data_[i], std::exp(this->a_cpu_data_[i]), this->near_delta_);
+    }
+  }
+}
+
+}  // namespace caffe
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 852cf6d6d38..f292972e4e2 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -12,11 +12,22 @@
 
 namespace caffe {
 
-const int data_alignment = Eigen::Aligned; // how is data allocated ?
-typedef Eigen::Map<const Eigen::VectorXf, data_alignment> const_map_vector_float_t;
-typedef Eigen::Map<Eigen::VectorXf, data_alignment> map_vector_float_t;
-typedef Eigen::Map<const Eigen::VectorXd, data_alignment> const_map_vector_double_t;
-typedef Eigen::Map<Eigen::VectorXd, data_alignment> map_vector_double_t;
+// Operations on aligned memory are faster than on unaligned memory.
+// But unfortunately, the pointers passed in are not always aligned.
+// Therefore, the memory-aligned Eigen::Map objects that wrap them
+// cannot be assigned to. This happens in lrn_layer and makes
+// test_lrn_layer crash with segmentation fault.
+// TODO: Use aligned Eigen::Map when the pointer to be wrapped is aligned.
+
+// Though the default map option is unaligned, making it explicit is no harm.
+//const int data_alignment = Eigen::Aligned; // how is data allocated ?
+const int data_alignment = Eigen::Unaligned;
+typedef Eigen::Array<float, 1, Eigen::Dynamic> float_array_t;
+typedef Eigen::Map<const float_array_t, data_alignment> const_map_vector_float_t;
+typedef Eigen::Map<float_array_t, data_alignment> map_vector_float_t;
+typedef Eigen::Array<double, 1, Eigen::Dynamic> double_array_t;
+typedef Eigen::Map<const double_array_t, data_alignment> const_map_vector_double_t;
+typedef Eigen::Map<double_array_t, data_alignment> map_vector_double_t;
 
 template<>
 void caffe_cpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
@@ -127,25 +138,6 @@ void caffe_gpu_axpy<double>(const int N, const double alpha, const double* X,
   CUBLAS_CHECK(cublasDaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1));
 }
 
-template <>
-void caffe_axpby<float>(const int N, const float alpha, const float* X,
-    const float beta, float* Y) {
-    // y := a*x + b*y
-    //cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
-    map_vector_float_t(Y, N) *= beta;
-    map_vector_float_t(Y, N) += (alpha * const_map_vector_float_t(X, N));
-
-}
-
-template <>
-void caffe_axpby<double>(const int N, const double alpha, const double* X,
-    const double beta, double* Y) {
-    // y := a*x + b*y
-  //cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
-    map_vector_double_t(Y, N) *= beta;
-    map_vector_double_t(Y, N) += (alpha * const_map_vector_double_t(X, N));
-}
-
 template <>
 void caffe_copy<float>(const int N, const float* X, float* Y) {
   cblas_scopy(N, X, 1, Y, 1);
@@ -201,189 +193,275 @@ void caffe_gpu_axpby<double>(const int N, const double alpha, const double* X,
 }
 
 template <>
-void caffe_sqr<float>(const int n, const float* a, float* y) {
-  //vsSqr(n, a, y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n).array().sqrt();
+void caffe_axpby<float>(const int N, const float alpha, const float* X,
+    const float beta, float* Y) {
+  // y := a*x + b*y
+  //cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
+  CHECK_GE(N, 0);
+  CHECK(X);
+  CHECK(Y);
+  map_vector_float_t y_map(Y, N);
+  // Eigen produces optimized code using lasy evaluation
+  // http://eigen.tuxfamily.org/dox/TopicLazyEvaluation.html
+  y_map = const_map_vector_float_t(X, N) * alpha + y_map * beta;
 }
 
 template <>
-void caffe_sqr<double>(const int n, const double* a, double* y) {
-    //vdSqr(n, a, y);
-    map_vector_double_t(y, n) = const_map_vector_double_t(a, n).array().sqrt();
+void caffe_axpby<double>(const int N, const double alpha, const double* X,
+    const double beta, double* Y) {
+    // y := a*x + b*y
+  //cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
+  CHECK_GE(N, 0);
+  CHECK(X);
+  CHECK(Y);
+  map_vector_double_t y_map(Y, N);
+  y_map = const_map_vector_double_t(X, N) * alpha + y_map * beta;
 }
 
 template <>
 void caffe_add<float>(const int n, const float* a, const float* b,
     float* y) {
-    //vsAdd(n, a, b, y);
-    map_vector_float_t(y, n) = const_map_vector_float_t(a, n) + const_map_vector_float_t(b, n);
+  //vsAdd(n, a, b, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(b);
+  CHECK(y);
+  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) +
+      const_map_vector_float_t(b, n);
 }
 
 template <>
 void caffe_add<double>(const int n, const double* a, const double* b,
     double* y) {
-    //vdAdd(n, a, b, y);
-    map_vector_double_t(y, n) = const_map_vector_double_t(a, n) + const_map_vector_double_t(b, n);
+  //vdAdd(n, a, b, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(b);
+  CHECK(y);
+  map_vector_double_t(y, n) = const_map_vector_double_t(a, n) +
+      const_map_vector_double_t(b, n);
 }
 
 template <>
 void caffe_sub<float>(const int n, const float* a, const float* b,
     float* y) {
-    //vsSub(n, a, b, y);
-    map_vector_float_t(y, n) = const_map_vector_float_t(a, n) - const_map_vector_float_t(b, n);
+  //vsSub(n, a, b, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(b);
+  CHECK(y);
+  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) -
+      const_map_vector_float_t(b, n);
 }
 
 template <>
 void caffe_sub<double>(const int n, const double* a, const double* b,
     double* y) {
-    //vdSub(n, a, b, y);
-    map_vector_double_t(y, n) = const_map_vector_double_t(a, n) - const_map_vector_double_t(b, n);
+  //vdSub(n, a, b, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(b);
+  CHECK(y);
+  map_vector_double_t(y, n) = const_map_vector_double_t(a, n) -
+      const_map_vector_double_t(b, n);
 }
 
 template <>
 void caffe_mul<float>(const int n, const float* a, const float* b,
     float* y) {
-    //vsMul(n, a, b, y);
-    map_vector_float_t(y, n) = const_map_vector_float_t(a, n).array() * const_map_vector_float_t(b, n).array();
+  //vsMul(n, a, b, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(b);
+  CHECK(y);
+  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) *
+       const_map_vector_float_t(b, n);
 }
 
 template <>
 void caffe_mul<double>(const int n, const double* a, const double* b,
     double* y) {
-    //vdMul(n, a, b, y);
-    map_vector_double_t(y, n) = const_map_vector_double_t(a, n).array() * const_map_vector_double_t(b, n).array();
+  //vdMul(n, a, b, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(b);
+  CHECK(y);
+  map_vector_double_t(y, n) = const_map_vector_double_t(a, n) *
+      const_map_vector_double_t(b, n);
 }
 
 template <>
 void caffe_div<float>(const int n, const float* a, const float* b,
     float* y) {
-    //vsDiv(n, a, b, y);
-    map_vector_float_t(y, n) = const_map_vector_float_t(a, n).array() / const_map_vector_float_t(b, n).array();
+  //vsDiv(n, a, b, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(b);
+  CHECK(y);
+  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) /
+      const_map_vector_float_t(b, n);
 }
 
 template <>
 void caffe_div<double>(const int n, const double* a, const double* b,
     double* y) {
-    //vdDiv(n, a, b, y);
-    map_vector_double_t(y, n) = const_map_vector_double_t(a, n).array() / const_map_vector_double_t(b, n).array();
+  //vdDiv(n, a, b, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(b);
+  CHECK(y);
+  map_vector_double_t(y, n) = const_map_vector_double_t(a, n) /
+      const_map_vector_double_t(b, n);
 }
 
 template <>
 void caffe_powx<float>(const int n, const float* a, const float b,
     float* y) {
-    //vsPowx(n, a, b, y);
-    map_vector_float_t(y, n) = const_map_vector_float_t(a, n).array().pow(b);
+  //vsPowx(n, a, b, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(y);
+  map_vector_float_t(y, n) = const_map_vector_float_t(a, n).pow(b);
 }
 
 template <>
 void caffe_powx<double>(const int n, const double* a, const double b,
     double* y) {
-    //vdPowx(n, a, b, y);
-    map_vector_double_t(y, n) = const_map_vector_double_t(a, n).array().pow(b);
+  //vdPowx(n, a, b, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(y);
+  map_vector_double_t(y, n) = const_map_vector_double_t(a, n).pow(b);
+}
+
+template <>
+void caffe_sqr<float>(const int n, const float* a, float* y) {
+  // http://software.intel.com/sites/products/documentation/hpc/mkl/mklman/GUID-F003F826-81BF-42EC-AE51-2EF624893133.htm
+  // v?Sqr Performs element by element squaring of the vector.
+  //vsSqr(n, a, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(y);
+  caffe_powx<float>(n, a, 2, y);
+  // TODO: which is faster?
+//  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) *
+//      const_map_vector_float_t(a, n);
+}
+
+template <>
+void caffe_sqr<double>(const int n, const double* a, double* y) {
+  //vdSqr(n, a, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(y);
+  caffe_powx<double>(n, a, 2, y);
+}
+
+template <>
+void caffe_exp<float>(const int n, const float* a, float* y) {
+  //vsExp(n, a, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(y);
+  map_vector_float_t(y, n) = const_map_vector_float_t(a, n).exp();
+}
+
+template <>
+void caffe_exp<double>(const int n, const double* a, double* y) {
+  //vdExp(n, a, y);
+  CHECK_GE(n, 0);
+  CHECK(a);
+  CHECK(y);
+  map_vector_double_t(y, n) = const_map_vector_double_t(a, n).exp();
 }
 
 template <typename Dtype>
 Dtype caffe_nextafter(const Dtype b) {
-  return boost::math::nextafter<Dtype, Dtype>(b, std::numeric_limits<Dtype>::max());
+  return boost::math::nextafter<Dtype, Dtype>(
+      b, std::numeric_limits<Dtype>::max());
 }
 
-template <>
-void caffe_vRngUniform<float>(const int n, float* r,
-    const float a, const float b) {
+template
+float caffe_nextafter(const float b);
+
+template
+double caffe_nextafter(const double b);
+
+template <typename Dtype>
+void caffe_vRngUniform(const int n, Dtype* r,
+    const Dtype a, const Dtype b) {
+  CHECK_GE(n, 0);
+  CHECK(r);
+  CHECK_LE(a, b);
   //VSL_CHECK(vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, Caffe::vsl_stream(),
   //    n, r, a, b));
 
   // FIXME check if boundaries are handled in the same way ?
-  boost::random::uniform_real_distribution<float> random_distribution(
-      a, caffe_nextafter<float>(b));
+  // Fixed by caffe_nextafter
+  boost::random::uniform_real_distribution<Dtype> random_distribution(
+      a, caffe_nextafter<Dtype>(b));
   Caffe::random_generator_t &generator = Caffe::vsl_stream();
 
-  for(int i = 0; i < n; i += 1)
-  {
-      r[i] = random_distribution(generator);
+  for(int i = 0; i < n; i += 1) {
+    r[i] = random_distribution(generator);
   }
 }
 
-template <>
+template
+void caffe_vRngUniform<float>(const int n, float* r,
+                                       const float a, const float b);
+template
 void caffe_vRngUniform<double>(const int n, double* r,
-    const double a, const double b) {
-  //VSL_CHECK(vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, Caffe::vsl_stream(),
-  //    n, r, a, b));
-
-    // FIXME check if boundaries are handled in the same way ?
-    boost::random::uniform_real_distribution<double> random_distribution(
-        a, caffe_nextafter<double>(b));
-    Caffe::random_generator_t &generator = Caffe::vsl_stream();
+                                       const double a, const double b);
 
-    for(int i = 0; i < n; i += 1)
-    {
-        r[i] = random_distribution(generator);
-    }
-}
-
-template <>
-void caffe_vRngGaussian<float>(const int n, float* r, const float a,
-    const float sigma) {
-    DCHECK(sigma > 0);
+template <typename Dtype>
+void caffe_vRngGaussian(const int n, Dtype* r, const Dtype a,
+    const Dtype sigma) {
+  CHECK_GE(n, 0);
+  CHECK(r);
+  CHECK_GT(sigma, 0);
   //VSL_CHECK(vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER,
 //      Caffe::vsl_stream(), n, r, a, sigma));
 
     // FIXME check if parameters are handled in the same way ?
-    boost::normal_distribution<float> random_distribution(a, sigma);
-    Caffe::random_generator_t &generator = Caffe::vsl_stream();
+    // http://www.boost.org/doc/libs/1_55_0/doc/html/boost/random/normal_distribution.html
+    // http://software.intel.com/sites/products/documentation/hpc/mkl/mklman/GUID-63196F25-5013-4038-8BCD-2613C4EF3DE4.htm
+    // The above two documents show that the probability density functions are different.
+    // But the unit tests still pass. Maybe their codes are the same or
+    // the tests are irrelevant to the random numbers.
+  boost::normal_distribution<Dtype> random_distribution(a, sigma);
+  Caffe::random_generator_t &generator = Caffe::vsl_stream();
 
-    for(int i = 0; i < n; i += 1)
-    {
-        r[i] = random_distribution(generator);
-    }
+  for(int i = 0; i < n; i += 1) {
+    r[i] = random_distribution(generator);
+  }
 }
 
+template
+void caffe_vRngGaussian<float>(const int n, float* r, const float a,
+    const float sigma);
 
-template <>
+template
 void caffe_vRngGaussian<double>(const int n, double* r, const double a,
-    const double sigma) {
-    DCHECK(sigma > 0);
-  //VSL_CHECK(vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER,
-  //    Caffe::vsl_stream(), n, r, a, sigma));
-
-    // FIXME check if parameters are handled in the same way ?
-    boost::normal_distribution<double> random_distribution(a, sigma);
-    Caffe::random_generator_t &generator = Caffe::vsl_stream();
-
-    for(int i = 0; i < n; i += 1)
-    {
-        r[i] = random_distribution(generator);
-    }
-}
-
+    const double sigma);
 
 template <typename Dtype>
-void caffe_vRngBernoulli(const int n, Dtype* r, const double p)
-{
+void caffe_vRngBernoulli(const int n, Dtype* r, const double p) {
+  CHECK_GE(n, 0);
+  CHECK(r);
+  CHECK_GE(p, 0);
+  CHECK_LE(p, 1);
     // FIXME check if parameters are handled in the same way ?
-    boost::bernoulli_distribution<Dtype> random_distribution(p);
-    Caffe::random_generator_t &generator = Caffe::vsl_stream();
-
-    for(int i = 0; i < n; i += 1)
-    {
-        r[i] = random_distribution(generator);
-    }
-}
-
-template void caffe_vRngBernoulli<int>(const int n, int* r, const double p);
-
+  boost::bernoulli_distribution<Dtype> random_distribution(p);
+  Caffe::random_generator_t &generator = Caffe::vsl_stream();
 
-template <>
-void caffe_exp<float>(const int n, const float* a, float* y) {
-    //vsExp(n, a, y);
-    map_vector_float_t(y, n) = const_map_vector_float_t(a, n).array().exp();
+  for(int i = 0; i < n; i += 1) {
+    r[i] = random_distribution(generator);
+  }
 }
 
-template <>
-void caffe_exp<double>(const int n, const double* a, double* y) {
-    //vdExp(n, a, y);
-    map_vector_double_t(y, n) = const_map_vector_double_t(a, n).array().exp();
-}
+template
+void caffe_vRngBernoulli<int>(const int n, int* r, const double p);
 
 template <>
 float caffe_cpu_dot<float>(const int n, const float* x, const float* y) {

From 1488f13c964b4259a268742214482ffa90af7f1e Mon Sep 17 00:00:00 2001
From: Evan Shelhamer <shelhamer@eecs.berkeley.edu>
Date: Wed, 8 Jan 2014 16:36:52 -0800
Subject: [PATCH 06/15] relax precision of MultinomialLogisticLossLayer test

---
 src/caffe/test/test_multinomial_logistic_loss_layer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/caffe/test/test_multinomial_logistic_loss_layer.cpp b/src/caffe/test/test_multinomial_logistic_loss_layer.cpp
index b19e4cfee81..6bd94ae24b8 100644
--- a/src/caffe/test/test_multinomial_logistic_loss_layer.cpp
+++ b/src/caffe/test/test_multinomial_logistic_loss_layer.cpp
@@ -54,7 +54,7 @@ TYPED_TEST(MultinomialLogisticLossLayerTest, TestGradientCPU) {
   Caffe::set_mode(Caffe::CPU);
   MultinomialLogisticLossLayer<TypeParam> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, &this->blob_top_vec_);
-  GradientChecker<TypeParam> checker(1e-2, 1e-2, 1701, 0, 0.05);
+  GradientChecker<TypeParam> checker(1e-2, 2*1e-2, 1701, 0, 0.05);
   checker.CheckGradientSingle(layer, this->blob_bottom_vec_,
       this->blob_top_vec_, 0, -1, -1);
 }

From 34f675f785b7a5bb8b1011cddacbd0dcfd8274be Mon Sep 17 00:00:00 2001
From: Alejandro Dubrovsky <alito@organicrobot.com>
Date: Wed, 22 Jan 2014 22:56:17 +1100
Subject: [PATCH 07/15] nextafter templates off one type

---
 src/caffe/util/math_functions.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index f292972e4e2..2a0c7c4e453 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -377,7 +377,7 @@ void caffe_exp<double>(const int n, const double* a, double* y) {
 
 template <typename Dtype>
 Dtype caffe_nextafter(const Dtype b) {
-  return boost::math::nextafter<Dtype, Dtype>(
+  return boost::math::nextafter<Dtype>(
       b, std::numeric_limits<Dtype>::max());
 }
 

From f15dc9ef5236a73e88458be1cdab8ea89fbd9245 Mon Sep 17 00:00:00 2001
From: Alejandro Dubrovsky <alito@organicrobot.com>
Date: Wed, 22 Jan 2014 22:56:57 +1100
Subject: [PATCH 08/15] mean_bound and sample_mean need referencing with this

---
 src/caffe/test/test_random_number_generator.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/caffe/test/test_random_number_generator.cpp b/src/caffe/test/test_random_number_generator.cpp
index 4c3358f9f49..26c9f2e32e0 100644
--- a/src/caffe/test/test_random_number_generator.cpp
+++ b/src/caffe/test/test_random_number_generator.cpp
@@ -43,8 +43,8 @@ TYPED_TEST(RandomNumberGeneratorTest, TestRngGaussian) {
   caffe_vRngGaussian(sample_size, (TypeParam*)data_a.mutable_cpu_data(), mu, sigma);
   TypeParam true_mean = mu;
   TypeParam true_std = sigma;
-  TypeParam bound = mean_bound(true_std, sample_size);
-  TypeParam real_mean = sample_mean((TypeParam*)data_a.cpu_data(), sample_size);
+  TypeParam bound = this->mean_bound(true_std, sample_size);
+  TypeParam real_mean = this->sample_mean((TypeParam*)data_a.cpu_data(), sample_size);
   EXPECT_NEAR(real_mean, true_mean, bound);
 }
 
@@ -57,8 +57,8 @@ TYPED_TEST(RandomNumberGeneratorTest, TestRngUniform) {
   caffe_vRngUniform(sample_size, (TypeParam*)data_a.mutable_cpu_data(), lower, upper);
   TypeParam true_mean = (lower + upper) / 2;
   TypeParam true_std = (upper - lower) / sqrt(12);
-  TypeParam bound = mean_bound(true_std, sample_size);
-  TypeParam real_mean = sample_mean((TypeParam*)data_a.cpu_data(), sample_size);
+  TypeParam bound = this->mean_bound(true_std, sample_size);
+  TypeParam real_mean = this->sample_mean((TypeParam*)data_a.cpu_data(), sample_size);
   EXPECT_NEAR(real_mean, true_mean, bound);
 }
 

From 2792f02d89b0a8b1d0e3849ed72795c25b291c7c Mon Sep 17 00:00:00 2001
From: Jeff Donahue <jeff.donahue@gmail.com>
Date: Wed, 22 Jan 2014 12:14:09 -0800
Subject: [PATCH 09/15] make uniform distribution usage compatible with boost
 1.46

---
 src/caffe/util/math_functions.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 2a0c7c4e453..8d7b0d07e88 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -398,7 +398,7 @@ void caffe_vRngUniform(const int n, Dtype* r,
 
   // FIXME check if boundaries are handled in the same way ?
   // Fixed by caffe_nextafter
-  boost::random::uniform_real_distribution<Dtype> random_distribution(
+  boost::uniform_real<Dtype> random_distribution(
       a, caffe_nextafter<Dtype>(b));
   Caffe::random_generator_t &generator = Caffe::vsl_stream();
 

From ad40955f56be978516cfc53809eb227ef3951138 Mon Sep 17 00:00:00 2001
From: Jeff Donahue <jeff.donahue@gmail.com>
Date: Wed, 22 Jan 2014 12:28:01 -0800
Subject: [PATCH 10/15] use boost variate_generator to pass tests w/ boost 1.46
 (Gaussian filler previously filled in all NaNs for me, making many tests
 fail)

---
 src/caffe/util/math_functions.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 8d7b0d07e88..a1c47caf3b7 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -431,9 +431,12 @@ void caffe_vRngGaussian(const int n, Dtype* r, const Dtype a,
     // the tests are irrelevant to the random numbers.
   boost::normal_distribution<Dtype> random_distribution(a, sigma);
   Caffe::random_generator_t &generator = Caffe::vsl_stream();
+  boost::variate_generator<Caffe::random_generator_t,
+      boost::normal_distribution<Dtype> > variate_generator(
+      generator, random_distribution);
 
-  for(int i = 0; i < n; i += 1) {
-    r[i] = random_distribution(generator);
+  for(int i = 0; i < n; ++i) {
+    r[i] = variate_generator();
   }
 }
 

From 430475bbb904c73f636489eb6a9f8c8176f9cddf Mon Sep 17 00:00:00 2001
From: Jeff Donahue <jeff.donahue@gmail.com>
Date: Wed, 22 Jan 2014 12:42:12 -0800
Subject: [PATCH 11/15] change all Rng's to use variate_generator for
 consistency

---
 src/caffe/util/math_functions.cpp | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index a1c47caf3b7..07b1aa01fd7 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -401,9 +401,12 @@ void caffe_vRngUniform(const int n, Dtype* r,
   boost::uniform_real<Dtype> random_distribution(
       a, caffe_nextafter<Dtype>(b));
   Caffe::random_generator_t &generator = Caffe::vsl_stream();
+  boost::variate_generator<Caffe::random_generator_t,
+      boost::uniform_real<Dtype> > variate_generator(
+      generator, random_distribution);
 
-  for(int i = 0; i < n; i += 1) {
-    r[i] = random_distribution(generator);
+  for (int i = 0; i < n; ++i) {
+    r[i] = variate_generator();
   }
 }
 
@@ -435,7 +438,7 @@ void caffe_vRngGaussian(const int n, Dtype* r, const Dtype a,
       boost::normal_distribution<Dtype> > variate_generator(
       generator, random_distribution);
 
-  for(int i = 0; i < n; ++i) {
+  for (int i = 0; i < n; ++i) {
     r[i] = variate_generator();
   }
 }
@@ -457,9 +460,12 @@ void caffe_vRngBernoulli(const int n, Dtype* r, const double p) {
     // FIXME check if parameters are handled in the same way ?
   boost::bernoulli_distribution<Dtype> random_distribution(p);
   Caffe::random_generator_t &generator = Caffe::vsl_stream();
+  boost::variate_generator<Caffe::random_generator_t,
+      boost::bernoulli_distribution<Dtype> > variate_generator(
+      generator, random_distribution);
 
-  for(int i = 0; i < n; i += 1) {
-    r[i] = random_distribution(generator);
+  for (int i = 0; i < n; ++i) {
+    r[i] = variate_generator();
   }
 }
 

From b08a2d97c5b9f6123724e6a3ae99e90f1bf4e3d7 Mon Sep 17 00:00:00 2001
From: Jeff Donahue <jeff.donahue@gmail.com>
Date: Wed, 29 Jan 2014 13:03:42 -0800
Subject: [PATCH 12/15] add bernoulli rng test to demonstrate bug (generates
 all 0s unless p == 1)

---
 .../test/test_random_number_generator.cpp     | 40 ++++++++++++++++---
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/src/caffe/test/test_random_number_generator.cpp b/src/caffe/test/test_random_number_generator.cpp
index 26c9f2e32e0..c43a5d9404c 100644
--- a/src/caffe/test/test_random_number_generator.cpp
+++ b/src/caffe/test/test_random_number_generator.cpp
@@ -24,6 +24,15 @@ class RandomNumberGeneratorTest : public ::testing::Test {
       return sum / sample_size;
   }
 
+  Dtype sample_mean(const int* const seqs, const size_t sample_size)
+  {
+      Dtype sum = 0;
+      for (int i = 0; i < sample_size; ++i) {
+          sum += Dtype(seqs[i]);
+      }
+      return sum / sample_size;
+  }
+
   Dtype mean_bound(const Dtype std, const size_t sample_size)
   {
       return  std/sqrt((double)sample_size);
@@ -40,28 +49,47 @@ TYPED_TEST(RandomNumberGeneratorTest, TestRngGaussian) {
   Caffe::set_random_seed(1701);
   TypeParam mu = 0;
   TypeParam sigma = 1;
-  caffe_vRngGaussian(sample_size, (TypeParam*)data_a.mutable_cpu_data(), mu, sigma);
+  caffe_vRngGaussian(sample_size,
+      (TypeParam*)data_a.mutable_cpu_data(), mu, sigma);
   TypeParam true_mean = mu;
   TypeParam true_std = sigma;
   TypeParam bound = this->mean_bound(true_std, sample_size);
-  TypeParam real_mean = this->sample_mean((TypeParam*)data_a.cpu_data(), sample_size);
-  EXPECT_NEAR(real_mean, true_mean, bound);
+  TypeParam empirical_mean =
+      this->sample_mean((TypeParam*)data_a.cpu_data(), sample_size);
+  EXPECT_NEAR(empirical_mean, true_mean, bound);
 }
 
+
 TYPED_TEST(RandomNumberGeneratorTest, TestRngUniform) {
   size_t sample_size = 10000;
   SyncedMemory data_a(sample_size * sizeof(TypeParam));
   Caffe::set_random_seed(1701);
   TypeParam lower = 0;
   TypeParam upper = 1;
-  caffe_vRngUniform(sample_size, (TypeParam*)data_a.mutable_cpu_data(), lower, upper);
+  caffe_vRngUniform(sample_size,
+      (TypeParam*)data_a.mutable_cpu_data(), lower, upper);
   TypeParam true_mean = (lower + upper) / 2;
   TypeParam true_std = (upper - lower) / sqrt(12);
   TypeParam bound = this->mean_bound(true_std, sample_size);
-  TypeParam real_mean = this->sample_mean((TypeParam*)data_a.cpu_data(), sample_size);
-  EXPECT_NEAR(real_mean, true_mean, bound);
+  TypeParam empirical_mean =
+      this->sample_mean((TypeParam*)data_a.cpu_data(), sample_size);
+  EXPECT_NEAR(empirical_mean, true_mean, bound);
 }
 
 
+TYPED_TEST(RandomNumberGeneratorTest, TestRngBernoulli) {
+  size_t sample_size = 10000;
+  SyncedMemory data_a(sample_size * sizeof(int));
+  Caffe::set_random_seed(1701);
+  double p = 0.3;
+  caffe_vRngBernoulli(sample_size, (int*)data_a.mutable_cpu_data(), p);
+  TypeParam true_mean = p;
+  TypeParam true_std = sqrt(p * (1 - p));
+  TypeParam bound = this->mean_bound(true_std, sample_size);
+  TypeParam empirical_mean =
+      this->sample_mean((const int *)data_a.cpu_data(), sample_size);
+  EXPECT_NEAR(empirical_mean, true_mean, bound);
+}
+
 
 }  // namespace caffe

From 10d68228aad09c8e27aa7824aa3ed3ab2b25fdef Mon Sep 17 00:00:00 2001
From: Jeff Donahue <jeff.donahue@gmail.com>
Date: Wed, 29 Jan 2014 13:11:34 -0800
Subject: [PATCH 13/15] fix bernoulli generator bug

---
 src/caffe/util/math_functions.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 07b1aa01fd7..8538d47f5bd 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -457,11 +457,10 @@ void caffe_vRngBernoulli(const int n, Dtype* r, const double p) {
   CHECK(r);
   CHECK_GE(p, 0);
   CHECK_LE(p, 1);
-    // FIXME check if parameters are handled in the same way ?
-  boost::bernoulli_distribution<Dtype> random_distribution(p);
+  boost::bernoulli_distribution<double> random_distribution(p);
   Caffe::random_generator_t &generator = Caffe::vsl_stream();
   boost::variate_generator<Caffe::random_generator_t,
-      boost::bernoulli_distribution<Dtype> > variate_generator(
+      boost::bernoulli_distribution<double> > variate_generator(
       generator, random_distribution);
 
   for (int i = 0; i < n; ++i) {

From 33e7141add05a933891265abc6ef6e02c86cb3b2 Mon Sep 17 00:00:00 2001
From: Kai Li <kaili_kloud@163.com>
Date: Fri, 7 Feb 2014 18:44:10 +0800
Subject: [PATCH 14/15] Replace atlas with multithreaded OpenBLAS to speed-up
 on multi-core CPU

issue: #79
---
 Makefile | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/Makefile b/Makefile
index 10db1b9f4bf..99eaa103fd0 100644
--- a/Makefile
+++ b/Makefile
@@ -68,13 +68,10 @@ MKL_INCLUDE_DIR := $(MKL_DIR)/include
 MKL_LIB_DIR := $(MKL_DIR)/lib $(MKL_DIR)/lib/intel64
 
 INCLUDE_DIRS += ./src ./include $(CUDA_INCLUDE_DIR) $(MKL_INCLUDE_DIR)
-LIBRARY_DIRS += $(CUDA_LIB_DIR) $(MKL_LIB_DIR) /usr/lib/atlas-base
-LIBRARIES := cudart cublas curand protobuf \
-        opencv_core opencv_highgui opencv_imgproc \
-	glog \
-	atlas cblas \
-	leveldb snappy pthread boost_system 
-	# mkl_rt mkl_intel_thread 
+LIBRARY_DIRS += $(CUDA_LIB_DIR) $(MKL_LIB_DIR)
+LIBRARIES := cudart cublas curand pthread openblas \
+	glog protobuf leveldb snappy boost_system \
+	opencv_core opencv_highgui opencv_imgproc
 PYTHON_LIBRARIES := boost_python python2.7
 WARNINGS := -Wall
 
@@ -82,7 +79,7 @@ COMMON_FLAGS := -DNDEBUG -O2 $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir
 CXXFLAGS += -pthread -fPIC $(COMMON_FLAGS)
 NVCCFLAGS := -ccbin=$(CXX) -Xcompiler -fPIC $(COMMON_FLAGS)
 LDFLAGS += $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) \
-		$(foreach library,$(LIBRARIES),-l$(library)) -Wl,-rpath=/usr/lib/atlas-base
+		$(foreach library,$(LIBRARIES),-l$(library))
 PYTHON_LDFLAGS := $(LDFLAGS) $(foreach library,$(PYTHON_LIBRARIES),-l$(library))
 
 

From 9bba82096c5ec36f61f2dc2c701b109cf719ec94 Mon Sep 17 00:00:00 2001
From: Rowland Depp <depp.rowland@gmail.com>
Date: Tue, 11 Feb 2014 21:41:01 -0800
Subject: [PATCH 15/15] major refactoring allow coexistence of MKL and non-MKL
 cases

---
 Makefile                              |  10 +-
 Makefile.config.example               |   2 +
 include/caffe/util/math_functions.hpp |   7 +-
 include/caffe/util/mkl_alternate.hpp  |  95 ++++++++++++++++
 src/caffe/layers/loss_layer.cu        |   2 +-
 src/caffe/solver.cpp                  |   2 +-
 src/caffe/util/math_functions.cpp     | 150 ++++----------------------
 7 files changed, 132 insertions(+), 136 deletions(-)
 create mode 100644 include/caffe/util/mkl_alternate.hpp

diff --git a/Makefile b/Makefile
index 99eaa103fd0..0792c10d467 100644
--- a/Makefile
+++ b/Makefile
@@ -69,7 +69,7 @@ MKL_LIB_DIR := $(MKL_DIR)/lib $(MKL_DIR)/lib/intel64
 
 INCLUDE_DIRS += ./src ./include $(CUDA_INCLUDE_DIR) $(MKL_INCLUDE_DIR)
 LIBRARY_DIRS += $(CUDA_LIB_DIR) $(MKL_LIB_DIR)
-LIBRARIES := cudart cublas curand pthread openblas \
+LIBRARIES := cudart cublas curand pthread \
 	glog protobuf leveldb snappy boost_system \
 	opencv_core opencv_highgui opencv_imgproc
 PYTHON_LIBRARIES := boost_python python2.7
@@ -82,6 +82,14 @@ LDFLAGS += $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) \
 		$(foreach library,$(LIBRARIES),-l$(library))
 PYTHON_LDFLAGS := $(LDFLAGS) $(foreach library,$(PYTHON_LIBRARIES),-l$(library))
 
+# MKL options
+ifdef USE_MKL
+  LIBRARIES += mkl_rt
+  COMMON_FLAGS += -DUSE_MKL
+else
+  LIBRARIES += atlas cblas
+endif
+
 
 ##############################
 # Define build targets
diff --git a/Makefile.config.example b/Makefile.config.example
index dd5b2360765..c3a1d8f03ab 100644
--- a/Makefile.config.example
+++ b/Makefile.config.example
@@ -7,6 +7,8 @@ CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \
     -gencode arch=compute_30,code=sm_30 \
     -gencode arch=compute_35,code=sm_35
 
+# If not using MKL, comment out the following line.
+# USE_MKL=1
 # MKL directory contains include/ and lib/ directions that we need.
 MKL_DIR := /opt/intel/mkl
 
diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index a4c63de0440..9b965e5cd82 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -3,10 +3,11 @@
 #ifndef CAFFE_UTIL_MATH_FUNCTIONS_H_
 #define CAFFE_UTIL_MATH_FUNCTIONS_H_
 
-//#include <mkl.h>
-#include <cblas.h>
+
 #include <cublas_v2.h>
 
+#include "caffe/util/mkl_alternate.hpp"
+
 namespace caffe {
 
 // Decaf gemm provides a simpler interface to the gemm functions, with the
@@ -45,7 +46,7 @@ void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X,
     Dtype* Y);
 
 template <typename Dtype>
-void caffe_axpby(const int N, const Dtype alpha, const Dtype* X,
+void caffe_cpu_axpby(const int N, const Dtype alpha, const Dtype* X,
     const Dtype beta, Dtype* Y);
 
 template <typename Dtype>
diff --git a/include/caffe/util/mkl_alternate.hpp b/include/caffe/util/mkl_alternate.hpp
new file mode 100644
index 00000000000..1c207c6782c
--- /dev/null
+++ b/include/caffe/util/mkl_alternate.hpp
@@ -0,0 +1,95 @@
+// Copyright 2013 Rowland Depp
+
+#ifndef CAFFE_UTIL_MKL_ALTERNATE_H_
+#define CAFFE_UTIL_MKL_ALTERNATE_H_
+
+#ifdef USE_MKL
+
+#include <mkl.h>
+
+#else  // If use MKL, simply include the MKL header
+
+#include <cblas.h>
+#include <math.h>
+
+// Functions that caffe uses but are not present if MKL is not linked.
+
+// A simple way to define the vsl unary functions. The operation should
+// be in the form e.g. y[i] = sqrt(a[i])
+#define DEFINE_VSL_UNARY_FUNC(name, operation) \
+  template<typename Dtype> \
+  void v##name(const int n, const Dtype* a, Dtype* y) { \
+    CHECK_GT(n, 0); CHECK(a); CHECK(y); \
+    for (int i = 0; i < n; ++i) { operation; } \
+  } \
+  inline void vs##name( \
+    const int n, const float* a, float* y) { \
+    v##name<float>(n, a, y); \
+  } \
+  inline void vd##name( \
+      const int n, const double* a, double* y) { \
+    v##name<double>(n, a, y); \
+  }
+
+DEFINE_VSL_UNARY_FUNC(Sqr, y[i] = a[i] * a[i]);
+DEFINE_VSL_UNARY_FUNC(Exp, y[i] = exp(a[i]));
+
+// A simple way to define the vsl unary functions with singular parameter b.
+// The operation should be in the form e.g. y[i] = pow(a[i], b)
+#define DEFINE_VSL_UNARY_FUNC_WITH_PARAM(name, operation) \
+  template<typename Dtype> \
+  void v##name(const int n, const Dtype* a, const Dtype b, Dtype* y) { \
+    CHECK_GT(n, 0); CHECK(a); CHECK(y); \
+    for (int i = 0; i < n; ++i) { operation; } \
+  } \
+  inline void vs##name( \
+    const int n, const float* a, const float b, float* y) { \
+    v##name<float>(n, a, b, y); \
+  } \
+  inline void vd##name( \
+      const int n, const double* a, const float b, double* y) { \
+    v##name<double>(n, a, b, y); \
+  }
+
+DEFINE_VSL_UNARY_FUNC_WITH_PARAM(Powx, y[i] = pow(a[i], b));
+
+// A simple way to define the vsl binary functions. The operation should
+// be in the form e.g. y[i] = a[i] + b[i]
+#define DEFINE_VSL_BINARY_FUNC(name, operation) \
+  template<typename Dtype> \
+  void v##name(const int n, const Dtype* a, const Dtype* b, Dtype* y) { \
+    CHECK_GT(n, 0); CHECK(a); CHECK(b); CHECK(y); \
+    for (int i = 0; i < n; ++i) { operation; } \
+  } \
+  inline void vs##name( \
+    const int n, const float* a, const float* b, float* y) { \
+    v##name<float>(n, a, b, y); \
+  } \
+  inline void vd##name( \
+      const int n, const double* a, const double* b, double* y) { \
+    v##name<double>(n, a, b, y); \
+  }
+
+DEFINE_VSL_BINARY_FUNC(Add, y[i] = a[i] + b[i]);
+DEFINE_VSL_BINARY_FUNC(Sub, y[i] = a[i] - b[i]);
+DEFINE_VSL_BINARY_FUNC(Mul, y[i] = a[i] * b[i]);
+DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i]);
+
+// In addition, MKL comes with an additional function axpby that is not present
+// in standard blas. We will simply use a two-step (inefficient, of course) way
+// to mimic that.
+inline void cblas_saxpby(const int N, const float alpha, const float* X,
+                         const int incX, const float beta, float* Y,
+                         const int incY) {
+  cblas_sscal(N, beta, Y, incY);
+  cblas_saxpy(N, alpha, X, incX, Y, incY);
+}
+inline void cblas_daxpby(const int N, const double alpha, const double* X,
+                         const int incX, const double beta, double* Y,
+                         const int incY) {
+  cblas_dscal(N, beta, Y, incY);
+  cblas_daxpy(N, alpha, X, incX, Y, incY);
+}
+
+#endif  // USE_MKL
+#endif  // CAFFE_UTIL_MKL_ALTERNATE_H_
diff --git a/src/caffe/layers/loss_layer.cu b/src/caffe/layers/loss_layer.cu
index ac05ba41b84..b04419eaafd 100644
--- a/src/caffe/layers/loss_layer.cu
+++ b/src/caffe/layers/loss_layer.cu
@@ -117,7 +117,7 @@ Dtype EuclideanLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   Dtype loss = caffe_cpu_dot(
       count, difference_.cpu_data(), difference_.cpu_data()) / num / Dtype(2);
   // Compute the gradient
-  caffe_axpby(count, Dtype(1) / num, difference_.cpu_data(), Dtype(0),
+  caffe_cpu_axpby(count, Dtype(1) / num, difference_.cpu_data(), Dtype(0),
       (*bottom)[0]->mutable_cpu_diff());
   return loss;
 }
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index 340bbe1dc04..18f479ae1a4 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -214,7 +214,7 @@ void SGDSolver<Dtype>::ComputeUpdateValue() {
       // Compute the value to history, and then copy them to the blob's diff.
       Dtype local_rate = rate * net_params_lr[param_id];
       Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
-      caffe_axpby(net_params[param_id]->count(), local_rate,
+      caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
           net_params[param_id]->cpu_diff(), momentum,
           history_[param_id]->mutable_cpu_data());
       if (local_decay) {
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 8538d47f5bd..bc96b3d2a36 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -2,7 +2,6 @@
 
 #include <limits>
 //#include <mkl.h>
-#include <eigen3/Eigen/Dense>
 #include <boost/math/special_functions/next.hpp>
 #include <boost/random.hpp>
 
@@ -12,23 +11,6 @@
 
 namespace caffe {
 
-// Operations on aligned memory are faster than on unaligned memory.
-// But unfortunately, the pointers passed in are not always aligned.
-// Therefore, the memory-aligned Eigen::Map objects that wrap them
-// cannot be assigned to. This happens in lrn_layer and makes
-// test_lrn_layer crash with segmentation fault.
-// TODO: Use aligned Eigen::Map when the pointer to be wrapped is aligned.
-
-// Though the default map option is unaligned, making it explicit is no harm.
-//const int data_alignment = Eigen::Aligned; // how is data allocated ?
-const int data_alignment = Eigen::Unaligned;
-typedef Eigen::Array<float, 1, Eigen::Dynamic> float_array_t;
-typedef Eigen::Map<const float_array_t, data_alignment> const_map_vector_float_t;
-typedef Eigen::Map<float_array_t, data_alignment> map_vector_float_t;
-typedef Eigen::Array<double, 1, Eigen::Dynamic> double_array_t;
-typedef Eigen::Map<const double_array_t, data_alignment> const_map_vector_double_t;
-typedef Eigen::Map<double_array_t, data_alignment> map_vector_double_t;
-
 template<>
 void caffe_cpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
     const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
@@ -125,7 +107,6 @@ template <>
 void caffe_axpy<double>(const int N, const double alpha, const double* X,
     double* Y) { cblas_daxpy(N, alpha, X, 1, Y, 1); }
 
-
 template <>
 void caffe_gpu_axpy<float>(const int N, const float alpha, const float* X,
     float* Y) {
@@ -193,186 +174,95 @@ void caffe_gpu_axpby<double>(const int N, const double alpha, const double* X,
 }
 
 template <>
-void caffe_axpby<float>(const int N, const float alpha, const float* X,
-    const float beta, float* Y) {
-  // y := a*x + b*y
-  //cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
-  CHECK_GE(N, 0);
-  CHECK(X);
-  CHECK(Y);
-  map_vector_float_t y_map(Y, N);
-  // Eigen produces optimized code using lasy evaluation
-  // http://eigen.tuxfamily.org/dox/TopicLazyEvaluation.html
-  y_map = const_map_vector_float_t(X, N) * alpha + y_map * beta;
+void caffe_cpu_axpby<float>(const int N, const float alpha, const float* X,
+                            const float beta, float* Y) {
+  cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
 }
 
 template <>
-void caffe_axpby<double>(const int N, const double alpha, const double* X,
-    const double beta, double* Y) {
-    // y := a*x + b*y
-  //cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
-  CHECK_GE(N, 0);
-  CHECK(X);
-  CHECK(Y);
-  map_vector_double_t y_map(Y, N);
-  y_map = const_map_vector_double_t(X, N) * alpha + y_map * beta;
+void caffe_cpu_axpby<double>(const int N, const double alpha, const double* X,
+                             const double beta, double* Y) {
+  cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
 }
 
 template <>
 void caffe_add<float>(const int n, const float* a, const float* b,
     float* y) {
-  //vsAdd(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) +
-      const_map_vector_float_t(b, n);
+  vsAdd(n, a, b, y);
 }
 
 template <>
 void caffe_add<double>(const int n, const double* a, const double* b,
     double* y) {
-  //vdAdd(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_double_t(y, n) = const_map_vector_double_t(a, n) +
-      const_map_vector_double_t(b, n);
+  vdAdd(n, a, b, y);
 }
 
 template <>
 void caffe_sub<float>(const int n, const float* a, const float* b,
     float* y) {
-  //vsSub(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) -
-      const_map_vector_float_t(b, n);
+  vsSub(n, a, b, y);
 }
 
 template <>
 void caffe_sub<double>(const int n, const double* a, const double* b,
     double* y) {
-  //vdSub(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_double_t(y, n) = const_map_vector_double_t(a, n) -
-      const_map_vector_double_t(b, n);
+  vdSub(n, a, b, y);
 }
 
 template <>
 void caffe_mul<float>(const int n, const float* a, const float* b,
     float* y) {
-  //vsMul(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) *
-       const_map_vector_float_t(b, n);
+  vsMul(n, a, b, y);
 }
 
 template <>
 void caffe_mul<double>(const int n, const double* a, const double* b,
     double* y) {
-  //vdMul(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_double_t(y, n) = const_map_vector_double_t(a, n) *
-      const_map_vector_double_t(b, n);
+  vdMul(n, a, b, y);
 }
 
 template <>
 void caffe_div<float>(const int n, const float* a, const float* b,
     float* y) {
-  //vsDiv(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) /
-      const_map_vector_float_t(b, n);
+  vsDiv(n, a, b, y);
 }
 
 template <>
 void caffe_div<double>(const int n, const double* a, const double* b,
     double* y) {
-  //vdDiv(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_double_t(y, n) = const_map_vector_double_t(a, n) /
-      const_map_vector_double_t(b, n);
+  vdDiv(n, a, b, y);
 }
 
 template <>
 void caffe_powx<float>(const int n, const float* a, const float b,
     float* y) {
-  //vsPowx(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n).pow(b);
+  vsPowx(n, a, b, y);
 }
 
 template <>
 void caffe_powx<double>(const int n, const double* a, const double b,
     double* y) {
-  //vdPowx(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(y);
-  map_vector_double_t(y, n) = const_map_vector_double_t(a, n).pow(b);
+  vdPowx(n, a, b, y);
 }
 
 template <>
 void caffe_sqr<float>(const int n, const float* a, float* y) {
-  // http://software.intel.com/sites/products/documentation/hpc/mkl/mklman/GUID-F003F826-81BF-42EC-AE51-2EF624893133.htm
-  // v?Sqr Performs element by element squaring of the vector.
-  //vsSqr(n, a, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(y);
-  caffe_powx<float>(n, a, 2, y);
-  // TODO: which is faster?
-//  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) *
-//      const_map_vector_float_t(a, n);
+  vsSqr(n, a, y);
 }
 
 template <>
 void caffe_sqr<double>(const int n, const double* a, double* y) {
-  //vdSqr(n, a, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(y);
-  caffe_powx<double>(n, a, 2, y);
+  vdSqr(n, a, y);
 }
 
 template <>
 void caffe_exp<float>(const int n, const float* a, float* y) {
-  //vsExp(n, a, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n).exp();
+  vsExp(n, a, y);
 }
 
 template <>
 void caffe_exp<double>(const int n, const double* a, double* y) {
-  //vdExp(n, a, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(y);
-  map_vector_double_t(y, n) = const_map_vector_double_t(a, n).exp();
+  vdExp(n, a, y);
 }
 
 template <typename Dtype>