handle fallback correctly for write inplace when the array is MKLDNN. (apache#10651)

* handle writeinplace correctly for mkldnn arrays.

* Add unit tests.

* Fix a bug in mkldnn copy.

* Fix a bug in ndarray copy.

* Verify results.
zheng-da committed Jun 28, 2018
1 parent 1bb6d7f commit 8d698dc
Showing 6 changed files with 217 additions and 17 deletions.
12 changes: 9 additions & 3 deletions src/common/exec_utils.h
@@ -78,8 +78,8 @@ inline bool SetupDefaultBlobsIn(const std::vector<NDArray>& src,
}

inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
const std::vector<OpReqType> &req,
const std::vector<NDArray> *bufs,
std::vector<OpReqType> *req,
std::vector<TBlob> *blobs,
std::vector<NDArray> *temp_src,
std::vector<NDArray> *temp_dst) {
@@ -88,6 +88,12 @@ inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
auto& nd = src[i];
bool is_default = nd.storage_type() == kDefaultStorage;
#if MXNET_USE_MKLDNN == 1
if (req->at(i) == kWriteInplace && nd.IsMKLDNNData())
// If it's write inplace and the output array doesn't use the default
// layout, we'll generate a temporary output array below, which means
// the input array and the output array are no longer the same array.
// We should change the request type.
req->at(i) = kWriteTo;
// We have to make sure it's default storage and default layout.
is_default = nd.IsDefaultData();
#endif
@@ -117,9 +123,9 @@ inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
*/
inline void SetupDefaultBlobsInOut(const std::vector<NDArray> &ndinputs,
const std::vector<NDArray> &ndoutputs,
const std::vector<OpReqType> &req,
const std::vector<NDArray> *in_bufs,
const std::vector<NDArray> *out_bufs,
std::vector<OpReqType> *req,
std::vector<TBlob> *input_blobs,
std::vector<TBlob> *output_blobs,
std::vector<NDArray> *pre_temp_src,
@@ -132,7 +138,7 @@ inline void SetupDefaultBlobsInOut(const std::vector<NDArray> &ndinputs,
SetupDefaultBlobsIn(ndinputs, in_bufs, input_blobs, pre_temp_src, pre_temp_dst,
in_temp_idx_map);
// populate output blobs
SetupDefaultBlobsOut(ndoutputs, req, out_bufs, output_blobs, post_temp_dst,
SetupDefaultBlobsOut(ndoutputs, out_bufs, req, output_blobs, post_temp_dst,
post_temp_src);
// add mutable inputs to post temp list
for (const auto idx : mutate_idx) {
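The exec_utils.h hunk above is the heart of the fix: when an output NDArray carries an MKLDNN layout and the request is kWriteInplace, SetupDefaultBlobsOut will allocate a temporary default-storage output, so the input and output are no longer the same buffer and the request has to be downgraded to kWriteTo. Below is a minimal, self-contained sketch of just that decision; ReqType, ArrayInfo, and DowngradeInplaceReqs are illustrative stand-ins, not MXNet's real types or signatures.

```cpp
#include <cassert>
#include <vector>

// Stand-ins for MXNet's OpReqType and NDArray; kWriteTo/kWriteInplace mirror
// the real enum names, everything else is illustrative only.
enum ReqType { kWriteTo, kWriteInplace };

struct ArrayInfo {
  bool is_mkldnn_layout;  // corresponds to NDArray::IsMKLDNNData()
};

// Mirrors the logic added to SetupDefaultBlobsOut: if an output will be
// replaced by a temporary default-layout buffer, in-place writing is no
// longer possible, so the request is downgraded to kWriteTo.
void DowngradeInplaceReqs(const std::vector<ArrayInfo>& outputs,
                          std::vector<ReqType>* reqs) {
  for (size_t i = 0; i < outputs.size(); ++i) {
    if (reqs->at(i) == kWriteInplace && outputs[i].is_mkldnn_layout)
      reqs->at(i) = kWriteTo;
  }
}

int main() {
  std::vector<ArrayInfo> outs = {{true}, {false}};
  std::vector<ReqType> reqs = {kWriteInplace, kWriteInplace};
  DowngradeInplaceReqs(outs, &reqs);
  assert(reqs[0] == kWriteTo);       // MKLDNN-layout output: downgraded
  assert(reqs[1] == kWriteInplace);  // default-layout output: unchanged
  return 0;
}
```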
7 changes: 6 additions & 1 deletion src/executor/attach_op_execs_pass.cc
@@ -78,7 +78,8 @@ class StorageFallbackOpExecutor : public OpExecutor {
pre_temp_src_.clear(); pre_temp_dst_.clear();
post_temp_src_.clear(); post_temp_dst_.clear();
in_temp_idx_map_.clear();
SetupDefaultBlobsInOut(in_array, out_array, req, &pre_temp_buf_, &post_temp_buf_,
tmp_req = req;
SetupDefaultBlobsInOut(in_array, out_array, &pre_temp_buf_, &post_temp_buf_, &req,
&in_data_, &out_data_,
&pre_temp_src_, &pre_temp_dst_,
&post_temp_src_, &post_temp_dst_,
@@ -89,8 +90,12 @@ class StorageFallbackOpExecutor : public OpExecutor {
// storage fallback after fcompute is completed
void PostFCompute(bool is_gpu) {
common::CastNonDefaultStorage(post_temp_src_, post_temp_dst_, op_ctx, is_gpu);
req = tmp_req;
}

// output requirement on each output array.
// This temporarily saves the original output requirements.
std::vector<OpReqType> tmp_req;
// default storage tensor blobs for fcompute
std::vector<TBlob> in_data_, out_data_;
// These are NDArray buffers for cast storage.
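The executor change wraps the same downgrade in a save-and-restore: the original output requests are copied into tmp_req before SetupDefaultBlobsInOut may rewrite req, and restored after the post-fcompute fallback copy, so the next execution of the node starts again from the graph's original requests. A rough sketch of that pattern follows; FallbackExecutorSketch and its PreFCompute/PostFCompute bodies are illustrative stand-ins, not the real StorageFallbackOpExecutor.

```cpp
#include <vector>

// Stand-in request type; kWriteTo/kWriteInplace mirror the real enum names.
enum ReqType { kWriteTo, kWriteInplace };

// Placeholder executor showing the save/restore of output requests around the
// storage-fallback path; illustrative only.
struct FallbackExecutorSketch {
  std::vector<ReqType> req;      // requests the operator will see
  std::vector<ReqType> tmp_req;  // original requests saved for one run

  void PreFCompute() {
    tmp_req = req;  // save before SetupDefaultBlobsInOut may rewrite req
    // ... SetupDefaultBlobsInOut(..., &req, ...) can turn kWriteInplace into kWriteTo
  }

  void PostFCompute() {
    // ... CastNonDefaultStorage(...) copies temporary outputs back first
    req = tmp_req;  // restore so the next run starts from the original requests
  }
};

int main() {
  FallbackExecutorSketch exec;
  exec.req = {kWriteInplace};
  exec.PreFCompute();
  exec.req[0] = kWriteTo;  // pretend the setup downgraded the in-place request
  exec.PostFCompute();
  return exec.req[0] == kWriteInplace ? 0 : 1;  // 0: restored as expected
}
```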
10 changes: 6 additions & 4 deletions src/imperative/imperative_utils.h
@@ -373,16 +373,17 @@ inline void PushFCompute(const FCompute& fn,
#if MXNET_USE_MKLDNN == 1
InvalidateOutputs(outputs, req);
#endif
std::vector<OpReqType> tmp_req = req;
// setup blobs
SetupDefaultBlobsInOut(inputs, outputs, req, nullptr, nullptr,
SetupDefaultBlobsInOut(inputs, outputs, nullptr, nullptr, &tmp_req,
&input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst,
&post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx);
// setup context
OpContext opctx{is_train, rctx, engine::CallbackOnComplete(), requested};
bool is_gpu = ctx.dev_mask() == gpu::kDevMask;
// pre-fcompute fallback, cast to default storage type
CastNonDefaultStorage(pre_temp_src, pre_temp_dst, opctx, is_gpu);
fn(attrs, opctx, input_blobs, req, output_blobs);
fn(attrs, opctx, input_blobs, tmp_req, output_blobs);
// post-fcompute fallback, cast to original storage type
CastNonDefaultStorage(post_temp_src, post_temp_dst, opctx, is_gpu);
if (is_gpu) {
@@ -492,15 +493,16 @@ inline void PushOperator(const OpStatePtr& state,
#if MXNET_USE_MKLDNN == 1
InvalidateOutputs(outputs, req);
#endif
std::vector<OpReqType> tmp_req = req;
// populate input blobs and output blobs
SetupDefaultBlobsInOut(inputs, outputs, req, nullptr, nullptr,
SetupDefaultBlobsInOut(inputs, outputs, nullptr, nullptr, &tmp_req,
&input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst,
&post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx);
// setup contexts
bool is_gpu = rctx.get_ctx().dev_mask() == gpu::kDevMask;
// pre-fcompute fallback
CastNonDefaultStorage(pre_temp_src, pre_temp_dst, opctx, is_gpu);
fcompute(state, opctx, input_blobs, req, output_blobs);
fcompute(state, opctx, input_blobs, tmp_req, output_blobs);
// post-fcompute fallback, cast to original storage type, if necessary
CastNonDefaultStorage(post_temp_src, post_temp_dst, opctx, is_gpu);
if (is_gpu && exec_type == ExecType::kSync) {
5 changes: 2 additions & 3 deletions src/ndarray/ndarray.cc
@@ -1118,9 +1118,8 @@ inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext
to_mem->get_primitive_desc().get_size());
memcpy(to_mem->get_data_handle(), from_mem->get_data_handle(), size);
} else {
std::vector<mkldnn::primitive> net;
net.push_back(mkldnn::reorder(*from_mem, *to_mem));
mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
const_cast<NDArray &>(to).CopyFrom(*from_mem);
MKLDNNStream::Get()->Submit();
}
} else {
// In this case, one of the NDArray isn't supported by MKLDNN, we need
8 changes: 7 additions & 1 deletion src/operator/nn/mkldnn/mkldnn_copy.cc
@@ -35,7 +35,13 @@ void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
const NDArray &in_data, const OpReqType &req,
const NDArray &out_data) {
TmpMemMgr::Get()->Init(ctx.requested[0]);
auto in_mem = in_data.GetMKLDNNData();

// If the input data is a view of an MKLDNN array, we should create a new
// NDArray with reordered data.
NDArray data = in_data;
if (data.IsMKLDNNData() && data.IsView())
data = data.Reorder2Default();
auto in_mem = data.GetMKLDNNData();
if (req == kAddTo) {
TmpMemMgr::Get()->Init(ctx.requested[0]);
// We should try and force the output memory to have the same format
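The mkldnn_copy.cc hunk handles views: per the added comment, if the input is a view of an MKLDNN array, a new NDArray with data reordered to the default layout is created before its MKLDNN memory is fetched. The sketch below mirrors just that check; only the method names IsMKLDNNData/IsView/Reorder2Default correspond to NDArray calls from the diff, while ArraySketch and PrepareCopySource are illustrative stand-ins.

```cpp
#include <iostream>

// Stand-in for the check added to MKLDNNCopy; not an MXNet class.
struct ArraySketch {
  bool mkldnn_layout;  // true if the data uses an MKLDNN (blocked) layout
  bool is_view;        // true if this array is a reshape/slice of another
  bool IsMKLDNNData() const { return mkldnn_layout; }
  bool IsView() const { return is_view; }
  ArraySketch Reorder2Default() const { return {false, is_view}; }
};

// Views of MKLDNN-layout arrays are reordered to the default layout before
// their memory is used as the copy source, as in the hunk above.
ArraySketch PrepareCopySource(const ArraySketch &in) {
  if (in.IsMKLDNNData() && in.IsView())
    return in.Reorder2Default();
  return in;
}

int main() {
  ArraySketch sliced_mkldnn{true, true};
  std::cout << std::boolalpha
            << PrepareCopySource(sliced_mkldnn).IsMKLDNNData() << "\n";  // false
  return 0;
}
```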
192 changes: 187 additions & 5 deletions tests/cpp/operator/mkldnn.cc
@@ -26,6 +26,7 @@
#if MXNET_USE_MKLDNN == 1

#include "gtest/gtest.h"
#include "mxnet/imperative.h"
#include "../../src/operator/nn/mkldnn/mkldnn_base-inl.h"

using namespace mxnet;
Expand Down Expand Up @@ -97,12 +98,18 @@ static void InitArray(NDArray *arr) {
}

// Init arrays with the specified layout.
static void InitMKLDNNArray(NDArray *arr, const mkldnn::memory::primitive_desc &pd) {
static void InitMKLDNNArray(NDArray *arr, const mkldnn::memory::primitive_desc &pd,
bool is_rand = false) {
const TBlob &blob = arr->data();
mshadow::default_real_t *data = blob.dptr<mshadow::default_real_t>();
size_t size = blob.Size();
for (size_t i = 0; i < size; i++)
data[i] = i;
if (is_rand) {
for (size_t i = 0; i < size; i++)
data[i] = std::rand();
} else {
for (size_t i = 0; i < size; i++)
data[i] = i;
}
arr->MKLDNNDataReorderAsync(pd);
arr->WaitToRead();
}
@@ -206,7 +213,7 @@ static std::vector<mkldnn::memory::format> GetMKLDNNFormat(size_t num_dims, int
}

struct TestArrayShapes {
std::vector<TShape> shapes;
std::vector<nnvm::TShape> shapes;
std::vector<mkldnn::memory::primitive_desc> pds;
};

@@ -239,7 +246,7 @@ static TestArrayShapes GetTestArrayShapes() {
{
// 4D
TShape s1(4);
s1[0] = 1; s1[1] = 96; s1[2] = 54; s1[3] = 54;
s1[0] = 10; s1[1] = 96; s1[2] = 54; s1[3] = 54;
shapes.push_back(s1);
pds.push_back(GetMemPD(s1, dtype, mkldnn::memory::format::nchw));

@@ -332,4 +339,179 @@ TEST(MKLDNN_NDArray, GetDataReorder) {
}
}

struct OpAttrs {
nnvm::NodeAttrs attrs;
std::vector<DispatchMode> dispatches;
};

OpAttrs GetCopyOp() {
OpAttrs attrs;
attrs.attrs.op = Op::Get("_copy");
attrs.dispatches.resize(2);
attrs.dispatches[0] = DispatchMode::kFCompute;
attrs.dispatches[1] = DispatchMode::kFComputeEx;
return attrs;
}

OpAttrs GetLeakyReluOp() {
OpAttrs attrs;
attrs.attrs.op = Op::Get("LeakyReLU");
attrs.dispatches.resize(1);
attrs.dispatches[0] = DispatchMode::kFCompute;
return attrs;
}

/*
* We want to get a few types of NDArrays for testing:
* 1. Normal NDArray
* 2. Normal NDArray with MKLDNN layout (output from an MKLDNN operator)
* 3. Normal NDArray with MKLDNN layout whose MKLDNN memory may have different
* dimensions from the NDArray (result of MKLDNNDataReorderAsync). However, this
* type of NDArrays only exists for weight arrays. I don't think we should
* pass them to all operators.
* In the inference mode, the MKLDNN memory in the weight array will be
* reordered to 5 dimensions.
* 4. Reshaped/sliced NDArray
* 5. Reshaped/sliced NDArray with MKLDNN layout (reshape/slice from Normal NDArray
* with MKLDNN layout)
* 6. Reshaped/sliced NDArray with MKLDNN layout whose MKLDNN memory may have
* different dimensions from the NDArray (result of MKLDNNDataReorderAsync).
* However, this type of NDArrays only exists for weight arrays. I don't think
* we should pass them to all operators.
* In the inference mode, the MKLDNN memory in the weight array will be
* reordered to 5 dimensions.
*
*/
std::vector<NDArray> GetTestInputArrays() {
TestArrayShapes tas = GetTestArrayShapes();
std::vector<nnvm::TShape> shapes = tas.shapes;
std::vector<mkldnn::memory::primitive_desc> pds = tas.pds;

std::vector<NDArray> in_arrs;
for (auto shape : shapes) {
in_arrs.emplace_back(shape, Context());
InitArray(&in_arrs.back());
for (auto pd : pds) {
if (shape.Size() != pd.get_size() / sizeof(mshadow::default_real_t))
continue;

in_arrs.emplace_back(shape, Context());
InitMKLDNNArray(&in_arrs.back(), pd);

// Get a sliced version.
NDArray arr(shape, Context());
InitMKLDNNArray(&arr, pd);
arr = arr.Slice(1, arr.shape()[0] - 1);
in_arrs.emplace_back(arr);
}
}
return in_arrs;
}

/*
* We want to get a few types of NDArrays for testing:
* 1. Normal NDArray
* 2. Normal NDArray with MKLDNN layout (output from an MKLDNN operator)
* 3. Normal NDArray with MKLDNN layout whose MKLDNN memory may have different
* dimensions from the NDArray (result of MKLDNNDataReorderAsync). However, this
* type of NDArrays only exists for weight arrays. I don't think we should
* pass them to all operators.
* In the inference mode, the MKLDNN memory in the weight array will be
* reordered to 5 dimensions.
* 4. Reused NDArray (this is created by the MXNet executor). This type of
* NDArrays can only be used as output arrays.
*/
std::vector<NDArray> GetTestOutputArrays(const TShape &shape,
const std::vector<mkldnn::memory::primitive_desc> &pds) {
std::vector<NDArray> in_arrs;
in_arrs.emplace_back(shape, Context());
InitArray(&in_arrs.back());

// Get a reused version.
nnvm::TShape s(1);
s[0] = shape.Size();
NDArray arr(s, Context());
arr = arr.AsArray(shape, arr.dtype());
InitArray(&arr);
in_arrs.emplace_back(arr);

for (auto pd : pds) {
if (shape.Size() != pd.get_size() / sizeof(mshadow::default_real_t))
continue;

in_arrs.emplace_back(shape, Context());
InitMKLDNNArray(&in_arrs.back(), pd, true);

// Get a reused version.
nnvm::TShape s(1);
s[0] = shape.Size();
arr = NDArray(s, Context());
arr = arr.AsArray(shape, arr.dtype());
InitMKLDNNArray(&arr, pd, true);
in_arrs.emplace_back(arr);
}
return in_arrs;
}

using VerifyFunc = std::function<void (const NDArray &in_arr, const NDArray &arr)>;

void VerifyCopyResult(const NDArray &in_arr, const NDArray &arr) {
NDArray tmp1 = in_arr.Reorder2Default();
NDArray tmp2 = arr.Reorder2Default();
EXPECT_EQ(tmp1.shape().Size(), tmp2.shape().Size());
TBlob d1 = tmp1.data();
TBlob d2 = tmp2.data();
EXPECT_EQ(memcmp(d1.dptr_, d2.dptr_,
tmp1.shape().Size() * sizeof(mshadow::default_real_t)), 0);
}

void TestUnaryOp(const OpAttrs &attrs, VerifyFunc verify_fn) {
std::vector<NDArray*> inputs(1);
std::vector<NDArray*> outputs(1);
std::vector<OpReqType> req(1);
std::vector<DispatchMode> dispatches = attrs.dispatches;

TestArrayShapes tas = GetTestArrayShapes();
std::vector<mkldnn::memory::primitive_desc> pds = tas.pds;

std::vector<NDArray> in_arrs = GetTestInputArrays();
for (auto in_arr : in_arrs) {
for (auto dispatch : dispatches) {
std::vector<NDArray> out_arrs = GetTestOutputArrays(in_arr.shape(), pds);
for (auto out_arr : out_arrs) {
req[0] = kWriteTo;
inputs[0] = &in_arr;
outputs[0] = &out_arr;
Imperative::Get()->InvokeOp(Context(), attrs.attrs, inputs,
outputs, req, dispatch, mxnet::OpStatePtr());
out_arr.WaitToRead();
verify_fn(in_arr, out_arr);
}
}
}

for (auto dispatch : dispatches) {
in_arrs = GetTestInputArrays();
for (auto arr : in_arrs) {
// If the array is a view, we shouldn't write data to it.
if (arr.IsView())
continue;

NDArray orig = arr.Copy(arr.ctx());
req[0] = kWriteInplace;
inputs[0] = &arr;
outputs[0] = &arr;
Imperative::Get()->InvokeOp(Context(), attrs.attrs, inputs, outputs, req,
dispatch, mxnet::OpStatePtr());
arr.WaitToRead();
verify_fn(orig, arr);
}
}
}

TEST(IMPERATIVE, UnaryOp) {
OpAttrs attrs = GetCopyOp();
TestUnaryOp(attrs, VerifyCopyResult);
}

#endif
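The in-place pass of TestUnaryOp above follows a snapshot-invoke-verify pattern: copy the array, invoke the operator with the same NDArray as both input and output under kWriteInplace, then compare the result against the copy. A minimal stand-alone sketch of that pattern, using a plain std::vector and a trivial copy op in place of NDArray and _copy (illustrative only):

```cpp
#include <cassert>
#include <vector>

// Stand-in for an array and a unary op; illustrative only.
using Array = std::vector<float>;

// A trivial "copy" operator writing its input into the output buffer.
void UnaryCopyOp(const Array &in, Array *out) { *out = in; }

int main() {
  Array arr = {0.f, 1.f, 2.f, 3.f};
  Array orig = arr;        // snapshot before the in-place invocation
  UnaryCopyOp(arr, &arr);  // input and output are the same array (write-inplace)
  assert(arr == orig);     // verify against the snapshot, as VerifyCopyResult does
  return 0;
}
```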
