From 03c8a619e7fbfa98b8c477b42c1faf128aec750b Mon Sep 17 00:00:00 2001
From: Kai Li <kaili_kloud@163.com>
Date: Tue, 25 Feb 2014 18:23:04 +0800
Subject: [PATCH 1/8] Add __builtin_popcount* based fast Hamming distance math
 function

---
 include/caffe/util/math_functions.hpp  |  4 ++
 src/caffe/test/test_math_functions.cpp | 77 ++++++++++++++++++++++++++
 src/caffe/util/math_functions.cpp      | 23 ++++++++
 3 files changed, 104 insertions(+)
 create mode 100644 src/caffe/test/test_math_functions.cpp
diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index e9e2db8f274..26abb2d02c2 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -1,4 +1,5 @@
 // Copyright 2013 Yangqing Jia
+// Copyright 2014 kloudkl@github
 
 #ifndef CAFFE_UTIL_MATH_FUNCTIONS_H_
 #define CAFFE_UTIL_MATH_FUNCTIONS_H_
@@ -100,6 +101,9 @@ Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y);
 template <typename Dtype>
 void caffe_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out);
 
+template <typename Dtype>
+int caffe_hamming_distance(const int n, const Dtype* x, const Dtype* y);
+
 }  // namespace caffe
 
 
diff --git a/src/caffe/test/test_math_functions.cpp b/src/caffe/test/test_math_functions.cpp
new file mode 100644
index 00000000000..0e313eefb4f
--- /dev/null
+++ b/src/caffe/test/test_math_functions.cpp
@@ -0,0 +1,77 @@
+// Copyright 2014 kloudkl@github
+
+#include <stdint.h> // for uint32_t & uint64_t
+
+#include "gtest/gtest.h"
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/util/math_functions.hpp"
+
+#include "caffe/test/test_caffe_main.hpp"
+
+namespace caffe {
+
+template<typename Dtype>
+class MathFunctionsTest : public ::testing::Test {
+ protected:
+  MathFunctionsTest()
+      : blob_bottom_(new Blob<Dtype>()),
+        blob_top_(new Blob<Dtype>()) {
+  }
+
+  virtual void SetUp() {
+    Caffe::set_random_seed(1701);
+    this->blob_bottom_->Reshape(100, 70, 50, 30);
+    this->blob_top_->Reshape(100, 70, 50, 30);
+    // fill the values
+    FillerParameter filler_param;
+    GaussianFiller<Dtype> filler(filler_param);
+    filler.Fill(this->blob_bottom_);
+    filler.Fill(this->blob_top_);
+  }
+
+  virtual ~MathFunctionsTest() {
+    delete blob_bottom_;
+    delete blob_top_;
+  }
+  // http://en.wikipedia.org/wiki/Hamming_distance
+  int ReferenceHammingDistance(const int n, const Dtype* x, const Dtype* y);
+
+  Blob<Dtype>* const blob_bottom_;
+  Blob<Dtype>* const blob_top_;
+};
+
+#define REF_HAMMING_DIST(float_type, int_type) \
+template<> \
+int MathFunctionsTest<float_type>::ReferenceHammingDistance(const int n, \
+                                                       const float_type* x, \
+                                                       const float_type* y) { \
+  int dist = 0; \
+  int_type val; \
+  for (int i = 0; i < n; ++i) { \
+    val = static_cast<int_type>(x[i]) ^ static_cast<int_type>(y[i]); \
+    /* Count the number of set bits */ \
+    while (val) { \
+      ++dist; \
+      val &= val - 1; \
+    } \
+  } \
+  return dist; \
+}
+
+REF_HAMMING_DIST(float, uint32_t);
+REF_HAMMING_DIST(double, uint64_t);
+
+typedef ::testing::Types<float, double> Dtypes;
+TYPED_TEST_CASE(MathFunctionsTest, Dtypes);
+
+TYPED_TEST(MathFunctionsTest, TestHammingDistance){
+  int n = this->blob_bottom_->count();
+  const TypeParam* x = this->blob_bottom_->cpu_data();
+  const TypeParam* y = this->blob_top_->cpu_data();
+  CHECK_EQ(this->ReferenceHammingDistance(n, x, y),
+           caffe_hamming_distance<TypeParam>(n, x, y));
+}
+
+}
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 60656b87093..790f00eaf0e 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -1,4 +1,5 @@
 // Copyright 2013 Yangqing Jia
+// Copyright 2014 kloudkl@github
 
 #include <mkl.h>
 #include <cublas_v2.h>
@@ -293,4 +294,26 @@ void caffe_gpu_dot<double>(const int n, const double* x, const double* y,
   CUBLAS_CHECK(cublasDdot(Caffe::cublas_handle(), n, x, 1, y, 1, out));
 }
 
+template <>
+int caffe_hamming_distance<float>(const int n, const float* x,
+                                  const float* y) {
+  int dist = 0;
+  for (int i = 0; i < n; ++i) {
+    dist += __builtin_popcount(static_cast<uint32_t>(x[i]) ^
+                               static_cast<uint32_t>(y[i]));
+  }
+  return dist;
+}
+
+template <>
+int caffe_hamming_distance<double>(const int n, const double* x,
+                                   const double* y) {
+  int dist = 0;  // differing bits, summed over the n value-converted pairs
+  for (int i = 0; i < n; ++i) {  // ll: unsigned long may be only 32 bits
+    dist += __builtin_popcountll(static_cast<uint64_t>(x[i]) ^
+                                 static_cast<uint64_t>(y[i]));
+  }
+  return dist;
+}
+
 }  // namespace caffe

From 0c30c067c1162f5b8780a8b1c6fd4d974b3d46e8 Mon Sep 17 00:00:00 2001
From: Kai Li <kaili_kloud@163.com>
Date: Tue, 25 Feb 2014 18:41:45 +0800
Subject: [PATCH 2/8] Add and test sum of absolute values math functions for
 CPU and GPU

---
 include/caffe/util/math_functions.hpp  |  7 +++++++
 src/caffe/test/test_math_functions.cpp | 24 ++++++++++++++++++++++++
 src/caffe/util/math_functions.cpp      | 20 ++++++++++++++++++++
 3 files changed, 51 insertions(+)

diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index 26abb2d02c2..8271c768a66 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -104,6 +104,13 @@ void caffe_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out);
 template <typename Dtype>
 int caffe_hamming_distance(const int n, const Dtype* x, const Dtype* y);
 
+// Returns the sum of the absolute values of the elements of vector x
+template <typename Dtype>
+Dtype caffe_cpu_asum(const int n, const Dtype* x);
+
+template <typename Dtype>
+void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y);
+
 }  // namespace caffe
 
 
diff --git a/src/caffe/test/test_math_functions.cpp b/src/caffe/test/test_math_functions.cpp
index 0e313eefb4f..c7ba4803a5f 100644
--- a/src/caffe/test/test_math_functions.cpp
+++ b/src/caffe/test/test_math_functions.cpp
@@ -1,6 +1,7 @@
 // Copyright 2014 kloudkl@github
 
 #include <stdint.h> // for uint32_t & uint64_t
+#include <cmath> // for std::fabs
 
 #include "gtest/gtest.h"
 #include "caffe/blob.hpp"
@@ -74,4 +75,27 @@ TYPED_TEST(MathFunctionsTest, TestHammingDistance){
            caffe_hamming_distance<TypeParam>(n, x, y));
 }
 
+TYPED_TEST(MathFunctionsTest, TestAsumCPU){
+  int n = this->blob_bottom_->count();
+  const TypeParam* x = this->blob_bottom_->cpu_data();
+  TypeParam std_asum = 0;  // reference: plain loop summing |x[i]|
+  for (int i = 0; i < n; ++i) {
+    std_asum += std::fabs(x[i]);
+  }
+  TypeParam cpu_asum = caffe_cpu_asum<TypeParam>(n, x);
+  CHECK_LT(std::fabs(cpu_asum - std_asum) / std_asum, 1e-2);  // two-sided
+}
+
+TYPED_TEST(MathFunctionsTest, TestAsumGPU){
+  int n = this->blob_bottom_->count();
+  const TypeParam* x = this->blob_bottom_->cpu_data();
+  TypeParam std_asum = 0;
+  for (int i = 0; i < n; ++i) {
+    std_asum += std::fabs(x[i]);
+  }
+  TypeParam gpu_asum;
+  caffe_gpu_asum<TypeParam>(n, this->blob_bottom_->gpu_data(), &gpu_asum);
+  CHECK_LT((gpu_asum - std_asum) / std_asum, 1e-2);
+}
+
 }
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 790f00eaf0e..5ff4e806481 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -316,4 +316,24 @@ int caffe_hamming_distance<double>(const int n, const double* x,
   return dist;
 }
 
+template <>
+float caffe_cpu_asum<float>(const int n, const float* x) {
+  return cblas_sasum(n, x, 1);
+}
+
+template <>
+double caffe_cpu_asum<double>(const int n, const double* x) {
+  return cblas_dasum(n, x, 1);
+}
+
+template <>
+void caffe_gpu_asum<float>(const int n, const float* x, float* y) {
+  CUBLAS_CHECK(cublasSasum(Caffe::cublas_handle(), n, x, 1, y));
+}
+
+template <>
+void caffe_gpu_asum<double>(const int n, const double* x, double* y) {
+  CUBLAS_CHECK(cublasDasum(Caffe::cublas_handle(), n, x, 1, y));
+}
+
 }  // namespace caffe

From 109f10e5d484c89dae804276522d348a9804bd33 Mon Sep 17 00:00:00 2001
From: Kai Li <kaili_kloud@163.com>
Date: Tue, 25 Feb 2014 19:16:44 +0800
Subject: [PATCH 3/8] Add and test element wise sign math functions for CPU and
 GPU

---
 include/caffe/util/math_functions.hpp  | 17 +++++++++++++++++
 src/caffe/test/test_math_functions.cpp | 21 +++++++++++++++++++++
 src/caffe/util/math_functions.cu       | 20 ++++++++++++++++++++
 3 files changed, 58 insertions(+)

diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index 8271c768a66..c828d69b9e3 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -111,6 +111,23 @@ Dtype caffe_cpu_asum(const int n, const Dtype* x);
 template <typename Dtype>
 void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y);
 
+// the branchless, type-safe version from
+// http://stackoverflow.com/questions/1903954/is-there-a-standard-sign-function-signum-sgn-in-c-c
+template<typename Dtype>
+inline char caffe_sign(Dtype val) {
+  return (Dtype(0) < val) - (val < Dtype(0));
+}
+
+template<typename Dtype>
+void caffe_cpu_sign(const int n, const Dtype* x, Dtype* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = caffe_sign<Dtype>(x[i]);
+  }
+}
+
+template<typename Dtype>
+void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y);
+
 }  // namespace caffe
 
 
diff --git a/src/caffe/test/test_math_functions.cpp b/src/caffe/test/test_math_functions.cpp
index c7ba4803a5f..09b4aa67719 100644
--- a/src/caffe/test/test_math_functions.cpp
+++ b/src/caffe/test/test_math_functions.cpp
@@ -98,4 +98,25 @@ TYPED_TEST(MathFunctionsTest, TestAsumGPU){
   CHECK_LT((gpu_asum - std_asum) / std_asum, 1e-2);
 }
 
+TYPED_TEST(MathFunctionsTest, TestSignCPU){
+  int n = this->blob_bottom_->count();
+  const TypeParam* x = this->blob_bottom_->cpu_data();
+  caffe_cpu_sign<TypeParam>(n, x, this->blob_bottom_->mutable_cpu_diff());
+  const TypeParam* signs = this->blob_bottom_->cpu_diff();
+  for (int i = 0; i < n; ++i) {
+    CHECK_EQ(signs[i], x[i] > 0 ? 1 : (x[i] < 0 ? -1 : 0));
+  }
+}
+
+TYPED_TEST(MathFunctionsTest, TestSignGPU){
+  int n = this->blob_bottom_->count();
+  caffe_gpu_sign<TypeParam>(n, this->blob_bottom_->gpu_data(),
+                            this->blob_bottom_->mutable_gpu_diff());
+  const TypeParam* signs = this->blob_bottom_->cpu_diff();
+  const TypeParam* x = this->blob_bottom_->cpu_data();
+  for (int i = 0; i < n; ++i) {
+    CHECK_EQ(signs[i], x[i] > 0 ? 1 : (x[i] < 0 ? -1 : 0));
+  }
+}
+
 }
diff --git a/src/caffe/util/math_functions.cu b/src/caffe/util/math_functions.cu
index b29a58abe7f..e6d50baa2e7 100644
--- a/src/caffe/util/math_functions.cu
+++ b/src/caffe/util/math_functions.cu
@@ -1,4 +1,5 @@
 // Copyright 2013 Yangqing Jia
+// Copyright 2014 kloudkl@github
 
 #include <cmath>
 #include <cstdlib>
@@ -34,5 +35,24 @@ void caffe_gpu_mul<double>(const int N, const double* a,
       N, a, b, y);
 }
 
+template<typename Dtype>
+__global__ void sign_kernel(const int n, const Dtype* x, Dtype* y) {
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  if (index < n) {
+    y[index] = (Dtype(0) < x[index]) - (x[index] < Dtype(0));
+  }
+}
+
+template <>
+void caffe_gpu_sign<float>(const int n, const float* x, float* y) {
+  sign_kernel<float><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(
+      n, x, y);
+}
+
+template <>
+void caffe_gpu_sign<double>(const int n, const double* x, double* y) {
+  sign_kernel<double><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(
+      n, x, y);
+}
 
 }  // namespace caffe

From 573dc2cde2fb718e5ba8d2cefe31a62ff7577167 Mon Sep 17 00:00:00 2001
From: Kai Li <kaili_kloud@163.com>
Date: Tue, 25 Feb 2014 19:21:18 +0800
Subject: [PATCH 4/8] Instantiate caffe_cpu_sign for float and double

---
 src/caffe/util/math_functions.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 5ff4e806481..11b31d78f80 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -336,4 +336,10 @@ void caffe_gpu_asum<double>(const int n, const double* x, double* y) {
   CUBLAS_CHECK(cublasDasum(Caffe::cublas_handle(), n, x, 1, y));
 }
 
+template <>
+void caffe_cpu_sign<float>(const int n, const float* x, float* y);
+
+template <>
+void caffe_cpu_sign<double>(const int n, const double* x, double* y);
+
 }  // namespace caffe

From 2f09616d8e6b9bd745a5b12d16957994c18cbb17 Mon Sep 17 00:00:00 2001
From: Kai Li <kaili_kloud@163.com>
Date: Tue, 25 Feb 2014 19:33:10 +0800
Subject: [PATCH 5/8] Add and test element wise abs math functions for CPU and
 GPU

---
 include/caffe/util/math_functions.hpp  | 11 +++++++++++
 src/caffe/test/test_math_functions.cpp | 21 +++++++++++++++++++++
 src/caffe/util/math_functions.cpp      |  6 ++++++
 src/caffe/util/math_functions.cu       | 21 +++++++++++++++++++++
 4 files changed, 59 insertions(+)

diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index c828d69b9e3..3f8beec593d 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -4,6 +4,7 @@
 #ifndef CAFFE_UTIL_MATH_FUNCTIONS_H_
 #define CAFFE_UTIL_MATH_FUNCTIONS_H_
 
+#include <cmath> // for std::fabs
 #include <mkl.h>
 #include <cublas_v2.h>
 
@@ -128,6 +129,16 @@ void caffe_cpu_sign(const int n, const Dtype* x, Dtype* y) {
 template<typename Dtype>
 void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y);
 
+template <typename Dtype>
+void caffe_cpu_fabs(const int n, const Dtype* x, Dtype* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = std::fabs(x[i]);
+  }
+}
+
+template <typename Dtype>
+void caffe_gpu_fabs(const int n, const Dtype* x, Dtype* y);
+
 }  // namespace caffe
 
 
diff --git a/src/caffe/test/test_math_functions.cpp b/src/caffe/test/test_math_functions.cpp
index 09b4aa67719..0f46273d257 100644
--- a/src/caffe/test/test_math_functions.cpp
+++ b/src/caffe/test/test_math_functions.cpp
@@ -119,4 +119,25 @@ TYPED_TEST(MathFunctionsTest, TestSignGPU){
   }
 }
 
+TYPED_TEST(MathFunctionsTest, TestFabsCPU){
+  int n = this->blob_bottom_->count();
+  const TypeParam* x = this->blob_bottom_->cpu_data();
+  caffe_cpu_fabs<TypeParam>(n, x, this->blob_bottom_->mutable_cpu_diff());
+  const TypeParam* abs_val = this->blob_bottom_->cpu_diff();
+  for (int i = 0; i < n; ++i) {
+    CHECK_EQ(abs_val[i], x[i] > 0 ? x[i] : -x[i]);
+  }
+}
+
+TYPED_TEST(MathFunctionsTest, TestFabsGPU){
+  int n = this->blob_bottom_->count();
+  caffe_gpu_fabs<TypeParam>(n, this->blob_bottom_->gpu_data(),
+                            this->blob_bottom_->mutable_gpu_diff());
+  const TypeParam* abs_val = this->blob_bottom_->cpu_diff();
+  const TypeParam* x = this->blob_bottom_->cpu_data();
+  for (int i = 0; i < n; ++i) {
+    CHECK_EQ(abs_val[i], x[i] > 0 ? x[i] : -x[i]);
+  }
+}
+
 }
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 11b31d78f80..ecee2f2e2ee 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -342,4 +342,10 @@ void caffe_cpu_sign<float>(const int n, const float* x, float* y);
 template <>
 void caffe_cpu_sign<double>(const int n, const double* x, double* y);
 
+template <>
+void caffe_cpu_fabs<float>(const int n, const float* x, float* y);
+
+template <>
+void caffe_cpu_fabs<double>(const int n, const double* x, double* y);
+
 }  // namespace caffe
diff --git a/src/caffe/util/math_functions.cu b/src/caffe/util/math_functions.cu
index e6d50baa2e7..8ecd72c49d0 100644
--- a/src/caffe/util/math_functions.cu
+++ b/src/caffe/util/math_functions.cu
@@ -4,6 +4,7 @@
 #include <cmath>
 #include <cstdlib>
 #include <cstring>
+#include <math_functions.h> // CUDA's, not caffe's, for fabs
 
 #include "caffe/common.hpp"
 #include "caffe/util/math_functions.hpp"
@@ -55,4 +56,24 @@ void caffe_gpu_sign<double>(const int n, const double* x, double* y) {
       n, x, y);
 }
 
+template<typename Dtype>
+__global__ void fabs_kernel(const int n, const Dtype* x, Dtype* y) {
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  if (index < n) {
+    y[index] = fabs(x[index]);
+  }
+}
+
+template <>
+void caffe_gpu_fabs<float>(const int n, const float* x, float* y) {
+  fabs_kernel<float><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(
+      n, x, y);
+}
+
+template <>
+void caffe_gpu_fabs<double>(const int n, const double* x, double* y) {
+  fabs_kernel<double><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(
+      n, x, y);
+}
+
 }  // namespace caffe

From f21ff16f7c2708508b9cf678efc9599bb3ab1594 Mon Sep 17 00:00:00 2001
From: Kai Li <kaili_kloud@163.com>
Date: Tue, 25 Feb 2014 20:06:05 +0800
Subject: [PATCH 6/8] Use macro to simplify element wise cpu math functions

---
 include/caffe/util/math_functions.hpp | 33 ++++++++++++++++++---------
 src/caffe/util/math_functions.cpp     | 13 ++---------
 2 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index 3f8beec593d..895b5752c2c 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -119,22 +119,33 @@ inline char caffe_sign(Dtype val) {
   return (Dtype(0) < val) - (val < Dtype(0));
 }
 
-template<typename Dtype>
-void caffe_cpu_sign(const int n, const Dtype* x, Dtype* y) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = caffe_sign<Dtype>(x[i]);
+// The following two macros are modifications of DEFINE_VSL_UNARY_FUNC
+//   in include/caffe/util/mkl_alternate.hpp authored by @Rowland Depp.
+// Please refer to commit 7e8ef25c7 of the boost-eigen branch.
+// Git cherry picking that commit caused a conflict hard to resolve and
+//   copying that file in convenient for code reviewing.
+// So they have to be pasted here temporarily.
+#define DEFINE_CAFFE_CPU_UNARY_FUNC(name, operation) \
+  template<typename Dtype> \
+  void caffe_cpu_##name(const int n, const Dtype* x, Dtype* y) { \
+    CHECK_GT(n, 0); CHECK(x); CHECK(y); \
+    for (int i = 0; i < n; ++i) { \
+      operation; \
+    } \
   }
-}
+
+#define INSTANTIATE_CAFFE_CPU_UNARY_FUNC(name) \
+  template <> \
+  void caffe_cpu_##name<float>(const int n, const float* x, float* y); \
+  template <> \
+  void caffe_cpu_##name<double>(const int n, const double* x, double* y)
+
+DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign<Dtype>(x[i]));
 
 template<typename Dtype>
 void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y);
 
-template <typename Dtype>
-void caffe_cpu_fabs(const int n, const Dtype* x, Dtype* y) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = std::fabs(x[i]);
-  }
-}
+DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i]));
 
 template <typename Dtype>
 void caffe_gpu_fabs(const int n, const Dtype* x, Dtype* y);
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index ecee2f2e2ee..50e5662e207 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -336,16 +336,7 @@ void caffe_gpu_asum<double>(const int n, const double* x, double* y) {
   CUBLAS_CHECK(cublasDasum(Caffe::cublas_handle(), n, x, 1, y));
 }
 
-template <>
-void caffe_cpu_sign<float>(const int n, const float* x, float* y);
-
-template <>
-void caffe_cpu_sign<double>(const int n, const double* x, double* y);
-
-template <>
-void caffe_cpu_fabs<float>(const int n, const float* x, float* y);
-
-template <>
-void caffe_cpu_fabs<double>(const int n, const double* x, double* y);
+INSTANTIATE_CAFFE_CPU_UNARY_FUNC(sign);
+INSTANTIATE_CAFFE_CPU_UNARY_FUNC(fabs);
 
 }  // namespace caffe

From a5333f7804762383671c85427337a19f3bbc52bf Mon Sep 17 00:00:00 2001
From: Kai Li <kaili_kloud@163.com>
Date: Tue, 25 Feb 2014 20:26:55 +0800
Subject: [PATCH 7/8] Add and test non-in-place scale math functions for CPU
 and GPU

---
 include/caffe/util/math_functions.hpp  |  6 ++++++
 src/caffe/test/test_math_functions.cpp | 26 ++++++++++++++++++++++++
 src/caffe/util/math_functions.cpp      | 28 ++++++++++++++++++++++++++
 3 files changed, 60 insertions(+)

diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index 895b5752c2c..222e29a0134 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -150,6 +150,12 @@ DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i]));
 template <typename Dtype>
 void caffe_gpu_fabs(const int n, const Dtype* x, Dtype* y);
 
+template <typename Dtype>
+void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y);
+
+template <typename Dtype>
+void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y);
+
 }  // namespace caffe
 
 
diff --git a/src/caffe/test/test_math_functions.cpp b/src/caffe/test/test_math_functions.cpp
index 0f46273d257..00f28badb86 100644
--- a/src/caffe/test/test_math_functions.cpp
+++ b/src/caffe/test/test_math_functions.cpp
@@ -140,4 +140,30 @@ TYPED_TEST(MathFunctionsTest, TestFabsGPU){
   }
 }
 
+TYPED_TEST(MathFunctionsTest, TestScaleCPU){
+  int n = this->blob_bottom_->count();
+  TypeParam alpha = this->blob_bottom_->cpu_diff()[rand() %
+                                               this->blob_bottom_->count()];
+  caffe_cpu_scale<TypeParam>(n, alpha, this->blob_bottom_->cpu_data(),
+                             this->blob_bottom_->mutable_cpu_diff());
+  const TypeParam* scaled = this->blob_bottom_->cpu_diff();
+  const TypeParam* x = this->blob_bottom_->cpu_data();
+  for (int i = 0; i < n; ++i) {
+    CHECK_EQ(scaled[i], x[i] * alpha);
+  }
+}
+
+TYPED_TEST(MathFunctionsTest, TestScaleGPU){
+  int n = this->blob_bottom_->count();
+  TypeParam alpha = this->blob_bottom_->cpu_diff()[rand() %
+                                               this->blob_bottom_->count()];
+  caffe_gpu_scale<TypeParam>(n, alpha, this->blob_bottom_->gpu_data(),
+                             this->blob_bottom_->mutable_gpu_diff());
+  const TypeParam* scaled = this->blob_bottom_->cpu_diff();
+  const TypeParam* x = this->blob_bottom_->cpu_data();
+  for (int i = 0; i < n; ++i) {
+    CHECK_EQ(scaled[i], x[i] * alpha);
+  }
+}
+
 }
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 50e5662e207..b822e076937 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -339,4 +339,32 @@ void caffe_gpu_asum<double>(const int n, const double* x, double* y) {
 INSTANTIATE_CAFFE_CPU_UNARY_FUNC(sign);
 INSTANTIATE_CAFFE_CPU_UNARY_FUNC(fabs);
 
+template <>
+void caffe_cpu_scale<float>(const int n, const float alpha, const float *x,
+                            float* y) {
+  cblas_scopy(n, x, 1, y, 1);
+  cblas_sscal(n, alpha, y, 1);
+}
+
+template <>
+void caffe_cpu_scale<double>(const int n, const double alpha, const double *x,
+                             double* y) {
+  cblas_dcopy(n, x, 1, y, 1);
+  cblas_dscal(n, alpha, y, 1);
+}
+
+template <>
+void caffe_gpu_scale<float>(const int n, const float alpha, const float *x,
+                            float* y) {
+  CUBLAS_CHECK(cublasScopy(Caffe::cublas_handle(), n, x, 1, y, 1));
+  CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), n, &alpha, y, 1));
+}
+
+template <>
+void caffe_gpu_scale<double>(const int n, const double alpha, const double *x,
+                             double* y) {
+  CUBLAS_CHECK(cublasDcopy(Caffe::cublas_handle(), n, x, 1, y, 1));
+  CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), n, &alpha, y, 1));
+}
+
 }  // namespace caffe

From c9d905618cc435b23e06c1cf667726ccc6a68a00 Mon Sep 17 00:00:00 2001
From: Kai Li <kaili_kloud@163.com>
Date: Wed, 26 Feb 2014 11:23:20 +0800
Subject: [PATCH 8/8] Add signbit math func, simplify GPU defs & instantiations
 with a macro

---
 include/caffe/util/math_functions.hpp  | 31 ++++++++++++++++--
 src/caffe/test/test_math_functions.cpp | 21 ++++++++++++
 src/caffe/util/math_functions.cpp      |  1 +
 src/caffe/util/math_functions.cu       | 44 +++-----------------------
 4 files changed, 54 insertions(+), 43 deletions(-)

diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index 222e29a0134..003d07cd4d3 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -5,6 +5,7 @@
 #define CAFFE_UTIL_MATH_FUNCTIONS_H_
 
 #include <cmath> // for std::fabs
+#include <math.h> // for signbit
 #include <mkl.h>
 #include <cublas_v2.h>
 
@@ -122,9 +123,6 @@ inline char caffe_sign(Dtype val) {
 // The following two macros are modifications of DEFINE_VSL_UNARY_FUNC
 //   in include/caffe/util/mkl_alternate.hpp authored by @Rowland Depp.
 // Please refer to commit 7e8ef25c7 of the boost-eigen branch.
-// Git cherry picking that commit caused a conflict hard to resolve and
-//   copying that file in convenient for code reviewing.
-// So they have to be pasted here temporarily.
 #define DEFINE_CAFFE_CPU_UNARY_FUNC(name, operation) \
   template<typename Dtype> \
   void caffe_cpu_##name(const int n, const Dtype* x, Dtype* y) { \
@@ -140,11 +138,38 @@ inline char caffe_sign(Dtype val) {
   template <> \
   void caffe_cpu_##name<double>(const int n, const double* x, double* y)
 
+
+#define DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(name, operation) \
+template<typename Dtype> \
+__global__ void name##_kernel(const int n, const Dtype* x, Dtype* y) { \
+  int index = threadIdx.x + blockIdx.x * blockDim.x; \
+  if (index < n) { \
+    operation; \
+  } \
+} \
+template <> \
+void caffe_gpu_##name<float>(const int n, const float* x, float* y) { \
+  name##_kernel<float><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>( \
+      n, x, y); \
+} \
+template <> \
+void caffe_gpu_##name<double>(const int n, const double* x, double* y) { \
+  name##_kernel<double><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>( \
+      n, x, y); \
+}
+
+// output is 1 for the positives, 0 for zero, and -1 for the negatives
 DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign<Dtype>(x[i]));
 
 template<typename Dtype>
 void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y);
 
+// returns a nonzero value if the input has its sign bit set.
+DEFINE_CAFFE_CPU_UNARY_FUNC(signbit, y[i] = std::signbit(x[i]));
+
+template<typename Dtype>
+void caffe_gpu_signbit(const int n, const Dtype* x, Dtype* y);
+
 DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i]));
 
 template <typename Dtype>
diff --git a/src/caffe/test/test_math_functions.cpp b/src/caffe/test/test_math_functions.cpp
index 00f28badb86..d314d73b45c 100644
--- a/src/caffe/test/test_math_functions.cpp
+++ b/src/caffe/test/test_math_functions.cpp
@@ -119,6 +119,27 @@ TYPED_TEST(MathFunctionsTest, TestSignGPU){
   }
 }
 
+TYPED_TEST(MathFunctionsTest, TestSignbitCPU){
+  int n = this->blob_bottom_->count();
+  const TypeParam* x = this->blob_bottom_->cpu_data();
+  caffe_cpu_signbit<TypeParam>(n, x, this->blob_bottom_->mutable_cpu_diff());
+  const TypeParam* signbits = this->blob_bottom_->cpu_diff();
+  for (int i = 0; i < n; ++i) {
+    CHECK_EQ(signbits[i], x[i] < 0 ? 1 : 0);
+  }
+}
+
+TYPED_TEST(MathFunctionsTest, TestSignbitGPU){
+  int n = this->blob_bottom_->count();
+  caffe_gpu_signbit<TypeParam>(n, this->blob_bottom_->gpu_data(),
+                            this->blob_bottom_->mutable_gpu_diff());
+  const TypeParam* signbits = this->blob_bottom_->cpu_diff();
+  const TypeParam* x = this->blob_bottom_->cpu_data();
+  for (int i = 0; i < n; ++i) {
+    CHECK_EQ(signbits[i], x[i] < 0 ? 1 : 0);
+  }
+}
+
 TYPED_TEST(MathFunctionsTest, TestFabsCPU){
   int n = this->blob_bottom_->count();
   const TypeParam* x = this->blob_bottom_->cpu_data();
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index b822e076937..8a2f25e0c34 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -337,6 +337,7 @@ void caffe_gpu_asum<double>(const int n, const double* x, double* y) {
 }
 
 INSTANTIATE_CAFFE_CPU_UNARY_FUNC(sign);
+INSTANTIATE_CAFFE_CPU_UNARY_FUNC(signbit);
 INSTANTIATE_CAFFE_CPU_UNARY_FUNC(fabs);
 
 template <>
diff --git a/src/caffe/util/math_functions.cu b/src/caffe/util/math_functions.cu
index 8ecd72c49d0..3ca5fea9a64 100644
--- a/src/caffe/util/math_functions.cu
+++ b/src/caffe/util/math_functions.cu
@@ -4,7 +4,7 @@
 #include <cmath>
 #include <cstdlib>
 #include <cstring>
-#include <math_functions.h> // CUDA's, not caffe's, for fabs
+#include <math_functions.h> // CUDA's, not caffe's, for fabs, signbit
 
 #include "caffe/common.hpp"
 #include "caffe/util/math_functions.hpp"
@@ -36,44 +36,8 @@ void caffe_gpu_mul<double>(const int N, const double* a,
       N, a, b, y);
 }
 
-template<typename Dtype>
-__global__ void sign_kernel(const int n, const Dtype* x, Dtype* y) {
-  int index = threadIdx.x + blockIdx.x * blockDim.x;
-  if (index < n) {
-    y[index] = (Dtype(0) < x[index]) - (x[index] < Dtype(0));
-  }
-}
-
-template <>
-void caffe_gpu_sign<float>(const int n, const float* x, float* y) {
-  sign_kernel<float><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(
-      n, x, y);
-}
-
-template <>
-void caffe_gpu_sign<double>(const int n, const double* x, double* y) {
-  sign_kernel<double><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(
-      n, x, y);
-}
-
-template<typename Dtype>
-__global__ void fabs_kernel(const int n, const Dtype* x, Dtype* y) {
-  int index = threadIdx.x + blockIdx.x * blockDim.x;
-  if (index < n) {
-    y[index] = fabs(x[index]);
-  }
-}
-
-template <>
-void caffe_gpu_fabs<float>(const int n, const float* x, float* y) {
-  fabs_kernel<float><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(
-      n, x, y);
-}
-
-template <>
-void caffe_gpu_fabs<double>(const int n, const double* x, double* y) {
-  fabs_kernel<double><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(
-      n, x, y);
-}
+DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index]) - (x[index] < Dtype(0)));
+DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(signbit, y[index] = signbit(x[index]));
+DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(fabs, y[index] = fabs(x[index]));
 
 }  // namespace caffe