handle fallback correctly for write inplace when the array is MKLDNN. (apache#10651)

* handle writeinplace correctly for mkldnn arrays.

* Add unit tests.

* Fix a bug in mkldnn copy.

* Fix a bug in ndarray copy.

* Verify results.
zheng-da committed Jun 28, 2018
1 parent 1bb6d7f commit 8d698dc
Showing 6 changed files with 217 additions and 17 deletions.
12 changes: 9 additions & 3 deletions src/common/exec_utils.h
@@ -78,8 +78,8 @@ inline bool SetupDefaultBlobsIn(const std::vector<NDArray>& src,
}

inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
const std::vector<OpReqType> &req,
const std::vector<NDArray> *bufs,
std::vector<OpReqType> *req,
std::vector<TBlob> *blobs,
std::vector<NDArray> *temp_src,
std::vector<NDArray> *temp_dst) {
@@ -88,6 +88,12 @@ inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
auto& nd = src[i];
bool is_default = nd.storage_type() == kDefaultStorage;
#if MXNET_USE_MKLDNN == 1
if (req->at(i) == kWriteInplace && nd.IsMKLDNNData())
// If it's write inplace and the output array doesn't use the default
// layout, we'll generate a temporary output array below, which means
// the input array and the output array are no longer the same array.
// We should change the request type.
req->at(i) = kWriteTo;
// We have to make sure it's default storage and default layout.
is_default = nd.IsDefaultData();
#endif
@@ -117,9 +123,9 @@ inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
*/
inline void SetupDefaultBlobsInOut(const std::vector<NDArray> &ndinputs,
const std::vector<NDArray> &ndoutputs,
const std::vector<OpReqType> &req,
const std::vector<NDArray> *in_bufs,
const std::vector<NDArray> *out_bufs,
std::vector<OpReqType> *req,
std::vector<TBlob> *input_blobs,
std::vector<TBlob> *output_blobs,
std::vector<NDArray> *pre_temp_src,
@@ -132,7 +138,7 @@ inline void SetupDefaultBlobsInOut(const std::vector<NDArray> &ndinputs,
SetupDefaultBlobsIn(ndinputs, in_bufs, input_blobs, pre_temp_src, pre_temp_dst,
in_temp_idx_map);
// populate output blobs
SetupDefaultBlobsOut(ndoutputs, req, out_bufs, output_blobs, post_temp_dst,
SetupDefaultBlobsOut(ndoutputs, out_bufs, req, output_blobs, post_temp_dst,
post_temp_src);
// add mutable inputs to post temp list
for (const auto idx : mutate_idx) {
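The exec_utils.h hunk above is the heart of the fix: when an output NDArray carries an MKLDNN layout and the request is kWriteInplace, SetupDefaultBlobsOut will allocate a temporary default-storage output, so the input and output are no longer the same buffer and the request has to be downgraded to kWriteTo. Below is a minimal, self-contained sketch of just that decision; ReqType, ArrayInfo, and DowngradeInplaceReqs are illustrative stand-ins, not MXNet's real types or signatures.

```cpp
#include <cassert>
#include <vector>

// Stand-ins for MXNet's OpReqType and NDArray; kWriteTo/kWriteInplace mirror
// the real enum names, everything else is illustrative only.
enum ReqType { kWriteTo, kWriteInplace };

struct ArrayInfo {
  bool is_mkldnn_layout;  // corresponds to NDArray::IsMKLDNNData()
};

// Mirrors the logic added to SetupDefaultBlobsOut: if an output will be
// replaced by a temporary default-layout buffer, in-place writing is no
// longer possible, so the request is downgraded to kWriteTo.
void DowngradeInplaceReqs(const std::vector<ArrayInfo>& outputs,
                          std::vector<ReqType>* reqs) {
  for (size_t i = 0; i < outputs.size(); ++i) {
    if (reqs->at(i) == kWriteInplace && outputs[i].is_mkldnn_layout)
      reqs->at(i) = kWriteTo;
  }
}

int main() {
  std::vector<ArrayInfo> outs = {{true}, {false}};
  std::vector<ReqType> reqs = {kWriteInplace, kWriteInplace};
  DowngradeInplaceReqs(outs, &reqs);
  assert(reqs[0] == kWriteTo);       // MKLDNN-layout output: downgraded
  assert(reqs[1] == kWriteInplace);  // default-layout output: unchanged
  return 0;
}
```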
7 changes: 6 additions & 1 deletion src/executor/attach_op_execs_pass.cc
@@ -78,7 +78,8 @@ class StorageFallbackOpExecutor : public OpExecutor {
pre_temp_src_.clear(); pre_temp_dst_.clear();
post_temp_src_.clear(); post_temp_dst_.clear();
in_temp_idx_map_.clear();
SetupDefaultBlobsInOut(in_array, out_array, req, &pre_temp_buf_, &post_temp_buf_,
tmp_req = req;
SetupDefaultBlobsInOut(in_array, out_array, &pre_temp_buf_, &post_temp_buf_, &req,
&in_data_, &out_data_,
&pre_temp_src_, &pre_temp_dst_,
&post_temp_src_, &post_temp_dst_,
@@ -89,8 +90,12 @@ class StorageFallbackOpExecutor : public OpExecutor {
// storage fallback after fcompute is completed
void PostFCompute(bool is_gpu) {
common::CastNonDefaultStorage(post_temp_src_, post_temp_dst_, op_ctx, is_gpu);
req = tmp_req;
}

// output requirement on each output array.
// This temporarily saves the original output requirements.
std::vector<OpReqType> tmp_req;
// default storage tensor blobs for fcompute
std::vector<TBlob> in_data_, out_data_;
// These are NDArray buffers for cast storage.
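The executor change wraps the same downgrade in a save-and-restore: the original output requests are copied into tmp_req before SetupDefaultBlobsInOut may rewrite req, and restored after the post-fcompute fallback copy, so the next execution of the node starts again from the graph's original requests. A rough sketch of that pattern follows; FallbackExecutorSketch and its PreFCompute/PostFCompute bodies are illustrative stand-ins, not the real StorageFallbackOpExecutor.

```cpp
#include <vector>

// Stand-in request type; kWriteTo/kWriteInplace mirror the real enum names.
enum ReqType { kWriteTo, kWriteInplace };

// Placeholder executor showing the save/restore of output requests around the
// storage-fallback path; illustrative only.
struct FallbackExecutorSketch {
  std::vector<ReqType> req;      // requests the operator will see
  std::vector<ReqType> tmp_req;  // original requests saved for one run

  void PreFCompute() {
    tmp_req = req;  // save before SetupDefaultBlobsInOut may rewrite req
    // ... SetupDefaultBlobsInOut(..., &req, ...) can turn kWriteInplace into kWriteTo
  }

  void PostFCompute() {
    // ... CastNonDefaultStorage(...) copies temporary outputs back first
    req = tmp_req;  // restore so the next run starts from the original requests
  }
};

int main() {
  FallbackExecutorSketch exec;
  exec.req = {kWriteInplace};
  exec.PreFCompute();
  exec.req[0] = kWriteTo;  // pretend the setup downgraded the in-place request
  exec.PostFCompute();
  return exec.req[0] == kWriteInplace ? 0 : 1;  // 0: restored as expected
}
```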
10 changes: 6 additions & 4 deletions src/imperative/imperative_utils.h
@@ -373,16 +373,17 @@ inline void PushFCompute(const FCompute& fn,
#if MXNET_USE_MKLDNN == 1
InvalidateOutputs(outputs, req);
#endif
std::vector<OpReqType> tmp_req = req;
// setup blobs
SetupDefaultBlobsInOut(inputs, outputs, req, nullptr, nullptr,
SetupDefaultBlobsInOut(inputs, outputs, nullptr, nullptr, &tmp_req,
&input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst,
&post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx);
// setup context
OpContext opctx{is_train, rctx, engine::CallbackOnComplete(), requested};
bool is_gpu = ctx.dev_mask() == gpu::kDevMask;
// pre-fcompute fallback, cast to default storage type
CastNonDefaultStorage(pre_temp_src, pre_temp_dst, opctx, is_gpu);
fn(attrs, opctx, input_blobs, req, output_blobs);
fn(attrs, opctx, input_blobs, tmp_req, output_blobs);
// post-fcompute fallback, cast to original storage type
CastNonDefaultStorage(post_temp_src, post_temp_dst, opctx, is_gpu);
if (is_gpu) {
@@ -492,15 +493,16 @@ inline void PushOperator(const OpStatePtr& state,
#if MXNET_USE_MKLDNN == 1
InvalidateOutputs(outputs, req);
#endif
std::vector<OpReqType> tmp_req = req;
// populate input blobs and output blobs
SetupDefaultBlobsInOut(inputs, outputs, req, nullptr, nullptr,
SetupDefaultBlobsInOut(inputs, outputs, nullptr, nullptr, &tmp_req,
&input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst,
&post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx);
// setup contexts
bool is_gpu = rctx.get_ctx().dev_mask() == gpu::kDevMask;
// pre-fcompute fallback
CastNonDefaultStorage(pre_temp_src, pre_temp_dst, opctx, is_gpu);
fcompute(state, opctx, input_blobs, req, output_blobs);
fcompute(state, opctx, input_blobs, tmp_req, output_blobs);
// post-fcompute fallback, cast to original storage type, if necessary
CastNonDefaultStorage(post_temp_src, post_temp_dst, opctx, is_gpu);
if (is_gpu && exec_type == ExecType::kSync) {
5 changes: 2 additions & 3 deletions src/ndarray/ndarray.cc
@@ -1118,9 +1118,8 @@ inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext
to_mem->get_primitive_desc().get_size());
memcpy(to_mem->get_data_handle(), from_mem->get_data_handle(), size);
} else {
std::vector<mkldnn::primitive> net;
net.push_back(mkldnn::reorder(*from_mem, *to_mem));
mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
const_cast<NDArray &>(to).CopyFrom(*from_mem);
MKLDNNStream::Get()->Submit();
}
} else {
// In this case, one of the NDArray isn't supported by MKLDNN, we need
8 changes: 7 additions & 1 deletion src/operator/nn/mkldnn/mkldnn_copy.cc
@@ -35,7 +35,13 @@ void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
const NDArray &in_data, const OpReqType &req,
const NDArray &out_data) {
TmpMemMgr::Get()->Init(ctx.requested[0]);
auto in_mem = in_data.GetMKLDNNData();

// If the input data is a view of an MKLDNN array, we should create a new
// NDArray with reordered data.
NDArray data = in_data;
if (data.IsMKLDNNData() && data.IsView())
data = data.Reorder2Default();
auto in_mem = data.GetMKLDNNData();
if (req == kAddTo) {
TmpMemMgr::Get()->Init(ctx.requested[0]);
// We should try and force the output memory to have the same format
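The mkldnn_copy.cc hunk handles views: per the added comment, if the input is a view of an MKLDNN array, a new NDArray with data reordered to the default layout is created before its MKLDNN memory is fetched. The sketch below mirrors just that check; only the method names IsMKLDNNData/IsView/Reorder2Default correspond to NDArray calls from the diff, while ArraySketch and PrepareCopySource are illustrative stand-ins.

```cpp
#include <iostream>

// Stand-in for the check added to MKLDNNCopy; not an MXNet class.
struct ArraySketch {
  bool mkldnn_layout;  // true if the data uses an MKLDNN (blocked) layout
  bool is_view;        // true if this array is a reshape/slice of another
  bool IsMKLDNNData() const { return mkldnn_layout; }
  bool IsView() const { return is_view; }
  ArraySketch Reorder2Default() const { return {false, is_view}; }
};

// Views of MKLDNN-layout arrays are reordered to the default layout before
// their memory is used as the copy source, as in the hunk above.
ArraySketch PrepareCopySource(const ArraySketch &in) {
  if (in.IsMKLDNNData() && in.IsView())
    return in.Reorder2Default();
  return in;
}

int main() {
  ArraySketch sliced_mkldnn{true, true};
  std::cout << std::boolalpha
            << PrepareCopySource(sliced_mkldnn).IsMKLDNNData() << "\n";  // false
  return 0;
}
```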
192 changes: 187 additions & 5 deletions tests/cpp/operator/mkldnn.cc
@@ -26,6 +26,7 @@
#if MXNET_USE_MKLDNN == 1

#include "gtest/gtest.h"
#include "mxnet/imperative.h"
#include "../../src/operator/nn/mkldnn/mkldnn_base-inl.h"

using namespace mxnet;
Expand Down Expand Up @@ -97,12 +98,18 @@ static void InitArray(NDArray *arr) {
}

// Init arrays with the specified layout.
static void InitMKLDNNArray(NDArray *arr, const mkldnn::memory::primitive_desc &pd) {
static void InitMKLDNNArray(NDArray *arr, const mkldnn::memory::primitive_desc &pd,
bool is_rand = false) {
const TBlob &blob = arr->data();
mshadow::default_real_t *data = blob.dptr<mshadow::default_real_t>();
size_t size = blob.Size();
for (size_t i = 0; i < size; i++)
data[i] = i;
if (is_rand) {
for (size_t i = 0; i < size; i++)
data[i] = std::rand();
} else {
for (size_t i = 0; i < size; i++)
data[i] = i;
}
arr->MKLDNNDataReorderAsync(pd);
arr->WaitToRead();
}
@@ -206,7 +213,7 @@ static std::vector<mkldnn::memory::format> GetMKLDNNFormat(size_t num_dims, int
}

struct TestArrayShapes {
std::vector<TShape> shapes;
std::vector<nnvm::TShape> shapes;
std::vector<mkldnn::memory::primitive_desc> pds;
};

@@ -239,7 +246,7 @@ static TestArrayShapes GetTestArrayShapes() {
{
// 4D
TShape s1(4);
s1[0] = 1; s1[1] = 96; s1[2] = 54; s1[3] = 54;
s1[0] = 10; s1[1] = 96; s1[2] = 54; s1[3] = 54;
shapes.push_back(s1);
pds.push_back(GetMemPD(s1, dtype, mkldnn::memory::format::nchw));

@@ -332,4 +339,179 @@ TEST(MKLDNN_NDArray, GetDataReorder) {
}
}

struct OpAttrs {
nnvm::NodeAttrs attrs;
std::vector<DispatchMode> dispatches;
};

OpAttrs GetCopyOp() {
OpAttrs attrs;
attrs.attrs.op = Op::Get("_copy");
attrs.dispatches.resize(2);
attrs.dispatches[0] = DispatchMode::kFCompute;
attrs.dispatches[1] = DispatchMode::kFComputeEx;
return attrs;
}

OpAttrs GetLeakyReluOp() {
OpAttrs attrs;
attrs.attrs.op = Op::Get("LeakyReLU");
attrs.dispatches.resize(1);
attrs.dispatches[0] = DispatchMode::kFCompute;
return attrs;
}

/*
* We want to get a few types of NDArrays for testing:
* 1. Normal NDArray
* 2. Normal NDArray with MKLDNN layout (output from an MKLDNN operator)
* 3. Normal NDArray with MKLDNN layout whose MKLDNN memory may have different
* dimensions from the NDArray (result of MKLDNNDataReorderAsync). However, this
* type of NDArrays only exists for weight arrays. I don't think we should
* pass them to all operators.
* In the inference mode, the MKLDNN memory in the weight array will be
* reordered to 5 dimensions.
* 4. Reshaped/sliced NDArray
* 5. Reshaped/sliced NDArray with MKLDNN layout (reshape/slice from Normal NDArray
* with MKLDNN layout)
* 6. Reshaped/sliced NDArray with MKLDNN layout whose MKLDNN memory may have
* different dimensions from the NDArray (result of MKLDNNDataReorderAsync).
* However, this type of NDArrays only exists for weight arrays. I don't think
* we should pass them to all operators.
* In the inference mode, the MKLDNN memory in the weight array will be
* reordered to 5 dimensions.
*
*/
std::vector<NDArray> GetTestInputArrays() {
TestArrayShapes tas = GetTestArrayShapes();
std::vector<nnvm::TShape> shapes = tas.shapes;
std::vector<mkldnn::memory::primitive_desc> pds = tas.pds;

std::vector<NDArray> in_arrs;
for (auto shape : shapes) {
in_arrs.emplace_back(shape, Context());
InitArray(&in_arrs.back());
for (auto pd : pds) {
if (shape.Size() != pd.get_size() / sizeof(mshadow::default_real_t))
continue;

in_arrs.emplace_back(shape, Context());
InitMKLDNNArray(&in_arrs.back(), pd);

// Get a sliced version.
NDArray arr(shape, Context());
InitMKLDNNArray(&arr, pd);
arr = arr.Slice(1, arr.shape()[0] - 1);
in_arrs.emplace_back(arr);
}
}
return in_arrs;
}

/*
* We want to get a few types of NDArrays for testing:
* 1. Normal NDArray
* 2. Normal NDArray with MKLDNN layout (output from an MKLDNN operator)
* 3. Normal NDArray with MKLDNN layout whose MKLDNN memory may have different
* dimensions from the NDArray (result of MKLDNNDataReorderAsync). However, this
* type of NDArrays only exists for weight arrays. I don't think we should
* pass them to all operators.
* In the inference mode, the MKLDNN memory in the weight array will be
* reordered to 5 dimensions.
* 4. Reused NDArray (this is created by the MXNet executor). This type of
* NDArrays can only be used as output arrays.
*/
std::vector<NDArray> GetTestOutputArrays(const TShape &shape,
const std::vector<mkldnn::memory::primitive_desc> &pds) {
std::vector<NDArray> in_arrs;
in_arrs.emplace_back(shape, Context());
InitArray(&in_arrs.back());

// Get a reused version.
nnvm::TShape s(1);
s[0] = shape.Size();
NDArray arr(s, Context());
arr = arr.AsArray(shape, arr.dtype());
InitArray(&arr);
in_arrs.emplace_back(arr);

for (auto pd : pds) {
if (shape.Size() != pd.get_size() / sizeof(mshadow::default_real_t))
continue;

in_arrs.emplace_back(shape, Context());
InitMKLDNNArray(&in_arrs.back(), pd, true);

// Get a reused version.
nnvm::TShape s(1);
s[0] = shape.Size();
arr = NDArray(s, Context());
arr = arr.AsArray(shape, arr.dtype());
InitMKLDNNArray(&arr, pd, true);
in_arrs.emplace_back(arr);
}
return in_arrs;
}

using VerifyFunc = std::function<void (const NDArray &in_arr, const NDArray &arr)>;

void VerifyCopyResult(const NDArray &in_arr, const NDArray &arr) {
NDArray tmp1 = in_arr.Reorder2Default();
NDArray tmp2 = arr.Reorder2Default();
EXPECT_EQ(tmp1.shape().Size(), tmp2.shape().Size());
TBlob d1 = tmp1.data();
TBlob d2 = tmp2.data();
EXPECT_EQ(memcmp(d1.dptr_, d2.dptr_,
tmp1.shape().Size() * sizeof(mshadow::default_real_t)), 0);
}

void TestUnaryOp(const OpAttrs &attrs, VerifyFunc verify_fn) {
std::vector<NDArray*> inputs(1);
std::vector<NDArray*> outputs(1);
std::vector<OpReqType> req(1);
std::vector<DispatchMode> dispatches = attrs.dispatches;

TestArrayShapes tas = GetTestArrayShapes();
std::vector<mkldnn::memory::primitive_desc> pds = tas.pds;

std::vector<NDArray> in_arrs = GetTestInputArrays();
for (auto in_arr : in_arrs) {
for (auto dispatch : dispatches) {
std::vector<NDArray> out_arrs = GetTestOutputArrays(in_arr.shape(), pds);
for (auto out_arr : out_arrs) {
req[0] = kWriteTo;
inputs[0] = &in_arr;
outputs[0] = &out_arr;
Imperative::Get()->InvokeOp(Context(), attrs.attrs, inputs,
outputs, req, dispatch, mxnet::OpStatePtr());
out_arr.WaitToRead();
verify_fn(in_arr, out_arr);
}
}
}

for (auto dispatch : dispatches) {
in_arrs = GetTestInputArrays();
for (auto arr : in_arrs) {
// If the array is a view, we shouldn't write data to it.
if (arr.IsView())
continue;

NDArray orig = arr.Copy(arr.ctx());
req[0] = kWriteInplace;
inputs[0] = &arr;
outputs[0] = &arr;
Imperative::Get()->InvokeOp(Context(), attrs.attrs, inputs, outputs, req,
dispatch, mxnet::OpStatePtr());
arr.WaitToRead();
verify_fn(orig, arr);
}
}
}

TEST(IMPERATIVE, UnaryOp) {
OpAttrs attrs = GetCopyOp();
TestUnaryOp(attrs, VerifyCopyResult);
}

#endif
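The in-place pass of TestUnaryOp above follows a snapshot-invoke-verify pattern: copy the array, invoke the operator with the same NDArray as both input and output under kWriteInplace, then compare the result against the copy. A minimal stand-alone sketch of that pattern, using a plain std::vector and a trivial copy op in place of NDArray and _copy (illustrative only):

```cpp
#include <cassert>
#include <vector>

// Stand-in for an array and a unary op; illustrative only.
using Array = std::vector<float>;

// A trivial "copy" operator writing its input into the output buffer.
void UnaryCopyOp(const Array &in, Array *out) { *out = in; }

int main() {
  Array arr = {0.f, 1.f, 2.f, 3.f};
  Array orig = arr;        // snapshot before the in-place invocation
  UnaryCopyOp(arr, &arr);  // input and output are the same array (write-inplace)
  assert(arr == orig);     // verify against the snapshot, as VerifyCopyResult does
  return 0;
}
```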
