diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index fc2b17c2e17..3b09459bbf1 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -313,6 +313,32 @@ class ConvolutionLayer : public Layer<Dtype> {
   int N_;
 };
 
+template <typename Dtype>
+class ConcatLayer : public Layer<Dtype> {
+ public:
+  explicit ConcatLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void SetUp(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top);
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top);
+  virtual Dtype Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const bool propagate_down, vector<Blob<Dtype>*>* bottom);
+  virtual Dtype Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const bool propagate_down, vector<Blob<Dtype>*>* bottom);
+  Blob<Dtype> col_bob_;
+
+  int COUNT_;
+  int NUM_;
+  int CHANNELS_;
+  int HEIGHT_;
+  int WIDTH_;
+  int concat_dim_;
+};
 
 // This function is used to create a pthread that prefetches the data.
 template <typename Dtype>
diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp
index d7798ea9053..1f79a74636b 100644
--- a/src/caffe/layer_factory.cpp
+++ b/src/caffe/layer_factory.cpp
@@ -25,6 +25,8 @@ Layer<Dtype>* GetLayer(const LayerParameter& param) {
     return new BNLLLayer<Dtype>(param);
   } else if (type == "conv") {
     return new ConvolutionLayer<Dtype>(param);
+  } else if (type == "concat") {
+    return new ConcatLayer<Dtype>(param);
   } else if (type == "data") {
     return new DataLayer<Dtype>(param);
   } else if (type == "hdf5_data") {
diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp
new file mode 100644
index 00000000000..dc949c14010
--- /dev/null
+++ b/src/caffe/layers/concat_layer.cpp
@@ -0,0 +1,108 @@
+// Copyright 2014 Sergio Guadarrama
+
+#include <vector>
+
+#include "caffe/layer.hpp"
+#include "caffe/vision_layers.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void ConcatLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top) {
+  CHECK_GT(bottom.size(), 1) <<
+    "Concat Layer takes at least two blobs as input.";
+  CHECK_EQ(top->size(), 1) <<
+    "Concat Layer takes a single blob as output.";
+  concat_dim_ = this->layer_param_.concat_dim();
+  CHECK_GE(concat_dim_, 0) << "concat_dim should be >= 0";
+  CHECK_LE(concat_dim_, 1) <<
+    "For now concat_dim <= 1; it can only concatenate num and channels";
+  // Initialize with the first blob
+  COUNT_ = bottom[0]->count();
+  NUM_ = bottom[0]->num();
+  CHANNELS_ = bottom[0]->channels();
+  HEIGHT_ = bottom[0]->height();
+  WIDTH_ = bottom[0]->width();
+  for (int i = 1; i < bottom.size(); ++i) {
+    COUNT_ += bottom[i]->count();
+    if (concat_dim_ == 0) {
+      NUM_ += bottom[i]->num();
+    } else if (concat_dim_ == 1) {
+      CHANNELS_ += bottom[i]->channels();
+    } else if (concat_dim_ == 2) {
+      HEIGHT_ += bottom[i]->height();
+    } else if (concat_dim_ == 3) {
+      WIDTH_ += bottom[i]->width();
+    }
+  }
+  (*top)[0]->Reshape(NUM_, CHANNELS_, HEIGHT_, WIDTH_);
+  CHECK_EQ(COUNT_, (*top)[0]->count());
+}
+
+template <typename Dtype>
+void ConcatLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top) {
+  Dtype* top_data = (*top)[0]->mutable_cpu_data();
+  if (concat_dim_ == 0) {
+    int offset_num = 0;
+    for (int i = 0; i < bottom.size(); ++i) {
+      const Dtype* bottom_data = bottom[i]->cpu_data();
+      int num_elem = bottom[i]->count();
+      caffe_copy(num_elem, bottom_data, top_data+(*top)[0]->offset(offset_num));
+      offset_num += bottom[i]->num();
+    }
+  } else if (concat_dim_ == 1) {
+    int offset_channel = 0;
+    for (int i = 0; i < bottom.size(); ++i) {
+      const Dtype* bottom_data = bottom[i]->cpu_data();
+      int num_elem =
+        bottom[i]->channels()*bottom[i]->height()*bottom[i]->width();
+      for (int n = 0; n < NUM_; ++n) {
+        caffe_copy(num_elem, bottom_data+bottom[i]->offset(n),
+          top_data+(*top)[0]->offset(n, offset_channel));
+      }
+      offset_channel += bottom[i]->channels();
+    }
+  } else {
+    LOG(FATAL) << "concat_dim along dim " << concat_dim_ <<
+      " not implemented yet";
+  }
+}
+
+template <typename Dtype>
+Dtype ConcatLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const bool propagate_down, vector<Blob<Dtype>*>* bottom) {
+  const Dtype* top_diff = top[0]->cpu_diff();
+  if (concat_dim_ == 0) {
+    int offset_num = 0;
+    for (int i = 0; i < bottom->size(); ++i) {
+      Blob<Dtype>* blob = (*bottom)[i];
+      Dtype* bottom_diff = blob->mutable_cpu_diff();
+      caffe_copy(blob->count(),
+        top_diff+top[0]->offset(offset_num), bottom_diff);
+      offset_num += blob->num();
+    }
+  } else if (concat_dim_ == 1) {
+    int offset_channel = 0;
+    for (int i = 0; i < bottom->size(); ++i) {
+      Blob<Dtype>* blob = (*bottom)[i];
+      Dtype* bottom_diff = blob->mutable_cpu_diff();
+      int num_elem = blob->channels()*blob->height()*blob->width();
+      for (int n = 0; n < NUM_; ++n) {
+        caffe_copy(num_elem, top_diff+top[0]->offset(n, offset_channel),
+          bottom_diff+blob->offset(n));
+      }
+      offset_channel += blob->channels();
+    }
+  } else {
+    LOG(FATAL) << "concat_dim along dim " << concat_dim_ <<
+      " not implemented yet";
+  }
+  return Dtype(0.);
+}
+
+INSTANTIATE_CLASS(ConcatLayer);
+
+}  // namespace caffe
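Both branches of `Forward_cpu` reduce to flat block copies because blobs are stored in row-major num × channels × height × width order, where `offset(n, c) = (n * CHANNELS_ + c) * HEIGHT_ * WIDTH_`: concatenating along num is one contiguous copy per bottom, while concatenating along channels needs one copy per image, placed at a running channel offset. Below is a minimal standalone sketch of that arithmetic (illustrative only, not part of the patch; plain `std::copy` stands in for `caffe_copy`, and the function name is made up):

```cpp
#include <algorithm>
#include <vector>

// Concatenate two NCHW tensors along the channel dimension (concat_dim == 1),
// assuming row-major layout: offset(n, c) = (n * C + c) * H * W.
std::vector<float> concat_channels(const std::vector<float>& a, int ca,
                                   const std::vector<float>& b, int cb,
                                   int n, int h, int w) {
  const int spatial = h * w;
  std::vector<float> out(n * (ca + cb) * spatial);
  for (int i = 0; i < n; ++i) {
    // Per image: one contiguous block per bottom, written at the running
    // channel offset -- exactly what caffe_copy does per n in the layer.
    std::copy(a.begin() + i * ca * spatial,
              a.begin() + (i + 1) * ca * spatial,
              out.begin() + (i * (ca + cb) + 0) * spatial);
    std::copy(b.begin() + i * cb * spatial,
              b.begin() + (i + 1) * cb * spatial,
              out.begin() + (i * (ca + cb) + ca) * spatial);
  }
  return out;
}
```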
diff --git a/src/caffe/layers/concat_layer.cu b/src/caffe/layers/concat_layer.cu
new file mode 100644
index 00000000000..616a5e61683
--- /dev/null
+++ b/src/caffe/layers/concat_layer.cu
@@ -0,0 +1,75 @@
+// Copyright 2014 Sergio Guadarrama
+
+#include <vector>
+
+#include "caffe/layer.hpp"
+#include "caffe/vision_layers.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void ConcatLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top) {
+  Dtype* top_data = (*top)[0]->mutable_gpu_data();
+  if (concat_dim_ == 0) {
+    int offset_num = 0;
+    for (int i = 0; i < bottom.size(); ++i) {
+      const Dtype* bottom_data = bottom[i]->gpu_data();
+      caffe_gpu_copy(bottom[i]->count(), bottom_data,
+        top_data+(*top)[0]->offset(offset_num));
+      offset_num += bottom[i]->num();
+    }
+  } else if (concat_dim_ == 1) {
+    int offset_channel = 0;
+    for (int i = 0; i < bottom.size(); ++i) {
+      const Dtype* bottom_data = bottom[i]->gpu_data();
+      int num_elem =
+        bottom[i]->channels()*bottom[i]->height()*bottom[i]->width();
+      for (int n = 0; n < NUM_; ++n) {
+        caffe_gpu_copy(num_elem, bottom_data+bottom[i]->offset(n),
+          top_data+(*top)[0]->offset(n, offset_channel));
+      }
+      offset_channel += bottom[i]->channels();
+    }
+  } else {
+    LOG(FATAL) << "concat_dim along dim " << concat_dim_ <<
+      " not implemented yet";
+  }
+}
+
+template <typename Dtype>
+Dtype ConcatLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const bool propagate_down, vector<Blob<Dtype>*>* bottom) {
+  const Dtype* top_diff = top[0]->gpu_diff();
+  if (concat_dim_ == 0) {
+    int offset_num = 0;
+    for (int i = 0; i < bottom->size(); ++i) {
+      Blob<Dtype>* blob = (*bottom)[i];
+      Dtype* bottom_diff = blob->mutable_gpu_diff();
+      caffe_gpu_copy(blob->count(),
+        top_diff+top[0]->offset(offset_num), bottom_diff);
+      offset_num += blob->num();
+    }
+  } else if (concat_dim_ == 1) {
+    int offset_channel = 0;
+    for (int i = 0; i < bottom->size(); ++i) {
+      Blob<Dtype>* blob = (*bottom)[i];
+      Dtype* bottom_diff = blob->mutable_gpu_diff();
+      int num_elem = blob->channels()*blob->height()*blob->width();
+      for (int n = 0; n < NUM_; ++n) {
+        caffe_gpu_copy(num_elem, top_diff+top[0]->offset(n, offset_channel),
+          bottom_diff+blob->offset(n));
+      }
+      offset_channel += blob->channels();
+    }
+  } else {
+    LOG(FATAL) << "concat_dim along dim " << concat_dim_ <<
+      " not implemented yet";
+  }
+  return Dtype(0.);
+}
+
+INSTANTIATE_CLASS(ConcatLayer);
+
+}  // namespace caffe
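The GPU path mirrors the CPU path line for line, swapping `caffe_copy` for device-to-device `caffe_gpu_copy`; likewise, the backward pass reuses the forward offsets but scatters `top_diff` back into each bottom's diff buffer instead of gathering. A sketch of the channel case as the inverse of `concat_channels` above (illustrative only; output vectors are assumed pre-sized):

```cpp
#include <algorithm>
#include <vector>

// Inverse of concat_channels: split the gradient of the concatenated top
// back into one slice per bottom, mirroring Backward_cpu/Backward_gpu.
// a_diff must hold n*ca*h*w elements and b_diff n*cb*h*w elements.
void split_channels(const std::vector<float>& top_diff,
                    std::vector<float>* a_diff, int ca,
                    std::vector<float>* b_diff, int cb,
                    int n, int h, int w) {
  const int spatial = h * w;
  for (int i = 0; i < n; ++i) {
    // Read from the running channel offset in top_diff, write contiguously.
    std::copy(top_diff.begin() + (i * (ca + cb) + 0) * spatial,
              top_diff.begin() + (i * (ca + cb) + ca) * spatial,
              a_diff->begin() + i * ca * spatial);
    std::copy(top_diff.begin() + (i * (ca + cb) + ca) * spatial,
              top_diff.begin() + (i * (ca + cb) + ca + cb) * spatial,
              b_diff->begin() + i * cb * spatial);
  }
}
```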
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index 4da8e8d3522..06a7cd7173e 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -91,6 +91,11 @@ message LayerParameter {
   // point would be set as rand_skip * rand(0,1). Note that rand_skip should not
   // be larger than the number of keys in the leveldb.
   optional uint32 rand_skip = 53 [ default = 0 ];
+
+  // The concat layer needs to know the dimension along which to concatenate;
+  // the other dimensions must be the same for all the bottom blobs.
+  // By default it will concatenate blobs along the channels dimension.
+  optional uint32 concat_dim = 65 [ default = 1 ];
 }
 
 message LayerConnection {
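With the new field in place, the concatenation axis is selected through the protoc-generated setter `set_concat_dim()`. A minimal usage sketch, mirroring the shapes used in the tests below (illustrative only, not part of the patch):

```cpp
#include <vector>

#include "caffe/blob.hpp"
#include "caffe/proto/caffe.pb.h"
#include "caffe/vision_layers.hpp"

using namespace caffe;  // for brevity in this sketch

int main() {
  // Two inputs that agree on every dimension except num.
  Blob<float> a(2, 3, 6, 5), b(5, 3, 6, 5), top;
  std::vector<Blob<float>*> bottom_vec, top_vec;
  bottom_vec.push_back(&a);
  bottom_vec.push_back(&b);
  top_vec.push_back(&top);

  LayerParameter param;
  param.set_type("concat");  // string dispatched by GetLayer in layer_factory.cpp
  param.set_concat_dim(0);   // concatenate along num; the default (1) is channels

  ConcatLayer<float> layer(param);
  layer.SetUp(bottom_vec, &top_vec);  // top is reshaped to 7 x 3 x 6 x 5
  return 0;
}
```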
diff --git a/src/caffe/test/test_concat_layer.cpp b/src/caffe/test/test_concat_layer.cpp
new file mode 100644
index 00000000000..3515ef96592
--- /dev/null
+++ b/src/caffe/test/test_concat_layer.cpp
@@ -0,0 +1,130 @@
+// Copyright 2014 Sergio Guadarrama
+
+#include <cstring>
+#include <vector>
+
+#include "cuda_runtime.h"
+#include "gtest/gtest.h"
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/vision_layers.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+#include "caffe/test/test_caffe_main.hpp"
+
+namespace caffe {
+
+extern cudaDeviceProp CAFFE_TEST_CUDA_PROP;
+
+template <typename Dtype>
+class ConcatLayerTest : public ::testing::Test {
+ protected:
+  ConcatLayerTest()
+      : blob_bottom_0(new Blob<Dtype>(2, 3, 6, 5)),
+        blob_bottom_1(new Blob<Dtype>(2, 5, 6, 5)),
+        blob_bottom_2(new Blob<Dtype>(5, 3, 6, 5)),
+        blob_top_(new Blob<Dtype>()) {}
+  virtual void SetUp() {
+    // fill the values
+    FillerParameter filler_param;
+    filler_param.set_value(1.);
+    ConstantFiller<Dtype> filler(filler_param);
+    filler.Fill(this->blob_bottom_0);
+    filler_param.set_value(2.);
+    filler.Fill(this->blob_bottom_1);
+    filler_param.set_value(3.);
+    filler.Fill(this->blob_bottom_2);
+    blob_bottom_vec_0.push_back(blob_bottom_0);
+    blob_bottom_vec_0.push_back(blob_bottom_1);
+    blob_bottom_vec_1.push_back(blob_bottom_0);
+    blob_bottom_vec_1.push_back(blob_bottom_2);
+    blob_top_vec_.push_back(blob_top_);
+  }
+
+  virtual ~ConcatLayerTest() {
+    delete blob_bottom_0; delete blob_bottom_1;
+    delete blob_bottom_2; delete blob_top_;
+  }
+
+  Blob<Dtype>* const blob_bottom_0;
+  Blob<Dtype>* const blob_bottom_1;
+  Blob<Dtype>* const blob_bottom_2;
+  Blob<Dtype>* const blob_top_;
+  vector<Blob<Dtype>*> blob_bottom_vec_0, blob_bottom_vec_1;
+  vector<Blob<Dtype>*> blob_top_vec_;
+};
+
+typedef ::testing::Types<float, double> Dtypes;
+TYPED_TEST_CASE(ConcatLayerTest, Dtypes);
+
+TYPED_TEST(ConcatLayerTest, TestSetupNum) {
+  LayerParameter layer_param;
+  layer_param.set_concat_dim(0);
+  ConcatLayer<TypeParam> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_1, &(this->blob_top_vec_));
+  EXPECT_EQ(this->blob_top_->num(),
+    this->blob_bottom_0->num() + this->blob_bottom_2->num());
+  EXPECT_EQ(this->blob_top_->channels(), this->blob_bottom_0->channels());
+  EXPECT_EQ(this->blob_top_->height(), this->blob_bottom_0->height());
+  EXPECT_EQ(this->blob_top_->width(), this->blob_bottom_0->width());
+}
+
+TYPED_TEST(ConcatLayerTest, TestSetupChannels) {
+  LayerParameter layer_param;
+  ConcatLayer<TypeParam> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_0, &(this->blob_top_vec_));
+  EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_0->num());
+  EXPECT_EQ(this->blob_top_->channels(),
+    this->blob_bottom_0->channels()+this->blob_bottom_1->channels());
+  EXPECT_EQ(this->blob_top_->height(), this->blob_bottom_0->height());
+  EXPECT_EQ(this->blob_top_->width(), this->blob_bottom_0->width());
+}
+
+
+TYPED_TEST(ConcatLayerTest, TestCPUChannels) {
+  LayerParameter layer_param;
+  ConcatLayer<TypeParam> layer(layer_param);
+  Caffe::set_mode(Caffe::CPU);
+  layer.SetUp(this->blob_bottom_vec_0, &(this->blob_top_vec_));
+  layer.Forward(this->blob_bottom_vec_0, &(this->blob_top_vec_));
+  for (int n = 0; n < this->blob_top_->num(); ++n) {
+    for (int c = 0; c < this->blob_bottom_0->channels(); ++c) {
+      for (int h = 0; h < this->blob_top_->height(); ++h) {
+        for (int w = 0; w < this->blob_top_->width(); ++w) {
+          EXPECT_EQ(this->blob_top_->data_at(n, c, h, w),
+            this->blob_bottom_vec_0[0]->data_at(n, c, h, w));
+        }
+      }
+    }
+    for (int c = 0; c < this->blob_bottom_1->channels(); ++c) {
+      for (int h = 0; h < this->blob_top_->height(); ++h) {
+        for (int w = 0; w < this->blob_top_->width(); ++w) {
+          EXPECT_EQ(this->blob_top_->data_at(n, c+3, h, w),
+            this->blob_bottom_vec_0[1]->data_at(n, c, h, w));
+        }
+      }
+    }
+  }
+}
+
+
+TYPED_TEST(ConcatLayerTest, TestCPUGradient) {
+  LayerParameter layer_param;
+  Caffe::set_mode(Caffe::CPU);
+  ConcatLayer<TypeParam> layer(layer_param);
+  GradientChecker<TypeParam> checker(1e-2, 1e-3);
+  checker.CheckGradient(&layer, &(this->blob_bottom_vec_0),
+    &(this->blob_top_vec_));
+}
+
+TYPED_TEST(ConcatLayerTest, TestGPUGradient) {
+  LayerParameter layer_param;
+  Caffe::set_mode(Caffe::GPU);
+  ConcatLayer<TypeParam> layer(layer_param);
+  GradientChecker<TypeParam> checker(1e-2, 1e-3);
+  checker.CheckGradient(&layer, &(this->blob_bottom_vec_0),
+    &(this->blob_top_vec_));
+}
+
+}  // namespace caffe