[inplace api] Batch add inplace APIs gt_, ge_, lt_, le_, eq_, not_equal_, logical_and_, logical_or_, logical_xor_, logical_not_, divide_, floor_divide_, bitwise_and_, bitwise_or_, bitwise_xor_, bitwise_not_ #55509

Merged
84 commits, merged on Aug 28, 2023

Commits
ff20306
tmp commit
GGBond8488 Jul 6, 2023
fc4e297
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
GGBond8488 Jul 18, 2023
3b71a68
Merge branch 'develop' of github.com:PaddlePaddle/Paddle into batch_a…
GGBond8488 Jul 25, 2023
9398b5e
add atan2
GGBond8488 Jul 31, 2023
7fef09c
Merge branch 'develop' of github.com:PaddlePaddle/Paddle into batch_a…
GGBond8488 Jul 31, 2023
793bd3e
add inplace api
GGBond8488 Jul 31, 2023
7c0f852
fix error
GGBond8488 Jul 31, 2023
e62e055
add inpalce divide
GGBond8488 Aug 3, 2023
08efca5
Merge branch 'develop' of github.com:PaddlePaddle/Paddle into batch_a…
GGBond8488 Aug 3, 2023
6bd940b
add inplace api
GGBond8488 Aug 3, 2023
2aa0846
add more inplace
GGBond8488 Aug 6, 2023
aef245b
add more inpalce
GGBond8488 Aug 7, 2023
29faa09
fix logical_not error
GGBond8488 Aug 7, 2023
a0f7316
support sinh and cosh in cpu
ScottWong98 Aug 7, 2023
500b04b
support asin, acos, atan, asinh, acosh, atanh in cpu
ScottWong98 Aug 7, 2023
29ae413
fix typro
GGBond8488 Aug 8, 2023
c6f5245
fix typro
GGBond8488 Aug 8, 2023
986cf39
mv out atan2 ldexp
GGBond8488 Aug 8, 2023
71fc663
mv out atan2 ldexp
GGBond8488 Aug 8, 2023
c8588ff
support sinh and cosh in gpu
ScottWong98 Aug 8, 2023
c7ba0ce
support asin, acos, atan, asinh, acosh, atanh in gpu
ScottWong98 Aug 8, 2023
066b96f
fix ge error
GGBond8488 Aug 9, 2023
4e6c4c5
fix dygraph commpare error
GGBond8488 Aug 9, 2023
7dcc9f5
fix dygraph commpare error
GGBond8488 Aug 9, 2023
c822064
check complex in python
ScottWong98 Aug 11, 2023
df933dd
fix cast inpalce error
GGBond8488 Aug 11, 2023
5d9537a
open inplace test
GGBond8488 Aug 11, 2023
d1c1ddb
fix ops.yaml error
GGBond8488 Aug 12, 2023
99164da
mv cast inpalce to python
GGBond8488 Aug 13, 2023
c8eac31
fix coverage ci
GGBond8488 Aug 14, 2023
d43672e
add last inplace
GGBond8488 Aug 14, 2023
3860b4a
fix inplace error
GGBond8488 Aug 15, 2023
7795e21
fix cast error
GGBond8488 Aug 15, 2023
52daba0
fix error
GGBond8488 Aug 15, 2023
47acec7
add nan_to_num_
GGBond8488 Aug 15, 2023
d86bdcf
fix typro
GGBond8488 Aug 15, 2023
969a775
fix sparse cast error
GGBond8488 Aug 15, 2023
8f3748a
Merge branch 'develop' into add_complex_support_for_math
ScottWong98 Aug 15, 2023
a7744b2
remove gpu 4
GGBond8488 Aug 15, 2023
a012645
fix static cast error
GGBond8488 Aug 16, 2023
2c37061
tmp commit
GGBond8488 Jul 6, 2023
87008b1
add atan2
GGBond8488 Jul 31, 2023
3a02bf9
add inplace api
GGBond8488 Jul 31, 2023
4c6f8f7
fix error
GGBond8488 Jul 31, 2023
5afd0e4
add inpalce divide
GGBond8488 Aug 3, 2023
7f10e3a
add inplace api
GGBond8488 Aug 3, 2023
2215a22
add more inplace
GGBond8488 Aug 6, 2023
ade74bf
add more inpalce
GGBond8488 Aug 7, 2023
7272e87
fix logical_not error
GGBond8488 Aug 7, 2023
ffd26b0
fix typro
GGBond8488 Aug 8, 2023
cfab627
fix typro
GGBond8488 Aug 8, 2023
1cb0529
mv out atan2 ldexp
GGBond8488 Aug 8, 2023
9104d60
mv out atan2 ldexp
GGBond8488 Aug 8, 2023
db301dd
fix ge error
GGBond8488 Aug 9, 2023
2c9a299
fix dygraph commpare error
GGBond8488 Aug 9, 2023
43e5484
fix dygraph commpare error
GGBond8488 Aug 9, 2023
27b8309
fix cast inpalce error
GGBond8488 Aug 11, 2023
3a0e180
open inplace test
GGBond8488 Aug 11, 2023
29d085f
fix ops.yaml error
GGBond8488 Aug 12, 2023
bd7756b
mv cast inpalce to python
GGBond8488 Aug 13, 2023
f44cb81
fix coverage ci
GGBond8488 Aug 14, 2023
77f64d0
add last inplace
GGBond8488 Aug 14, 2023
f3a8fc0
fix inplace error
GGBond8488 Aug 15, 2023
56b44fd
fix cast error
GGBond8488 Aug 15, 2023
c264e27
fix error
GGBond8488 Aug 15, 2023
5b0c9a3
add nan_to_num_
GGBond8488 Aug 15, 2023
502469d
fix typro
GGBond8488 Aug 15, 2023
2ca1c00
fix sparse cast error
GGBond8488 Aug 15, 2023
4d55a81
remove gpu 4
GGBond8488 Aug 15, 2023
dc1be49
fix static cast error
GGBond8488 Aug 16, 2023
aaa8743
fix cast error
GGBond8488 Aug 18, 2023
b0b268a
merge
GGBond8488 Aug 21, 2023
4a78509
fix
GGBond8488 Aug 21, 2023
d10ec7d
Revert "check complex in python"
GGBond8488 Aug 21, 2023
bfaeb7a
add renorm , fix error
GGBond8488 Aug 22, 2023
7950dc2
add coverage
GGBond8488 Aug 22, 2023
8957d0d
fix cumsum inpalce version error
GGBond8488 Aug 22, 2023
62bea67
add cast inpalce impl
GGBond8488 Aug 23, 2023
163ae79
rm test.log
GGBond8488 Aug 23, 2023
d9fa589
fix multiply_dyfunction and add multiply_backward test
GGBond8488 Aug 23, 2023
468528c
add and use is_same_tensor
GGBond8488 Aug 23, 2023
a95a282
fix typro
GGBond8488 Aug 24, 2023
a206eba
fix sone error
GGBond8488 Aug 24, 2023
1af98b2
fix typro
GGBond8488 Aug 25, 2023
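
All of the APIs listed in the title follow Paddle's trailing-underscore convention: each one writes its result into the first input tensor instead of allocating a new output. A minimal usage sketch with illustrative values (not taken from the PR's tests):

import paddle

x = paddle.to_tensor([1.0, 2.0, 3.0])
y = paddle.to_tensor([3.0, 2.0, 1.0])

x.divide_(y)   # in-place: x now holds x / y, no new output tensor is allocated
print(x)       # [0.33333334, 1., 3.]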
@@ -262,6 +262,32 @@ paddle::Tensor& multiply__ad_func(paddle::Tensor& x, // NOLINT
VLOG(3) << paddle::string::Sprintf(INPUT_PRINT_TEMPLATE, input_str);
}

bool trace_backward = egr::Controller::Instance().HasGrad();
bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(
trace_backward, x_autograd_meta, y_autograd_meta);

// Node Declaration
std::shared_ptr<MultiplyGradNode> grad_node;
// Set grad_node before API Call
if (require_any_grad) {
paddle::platform::RecordEvent node_creation_record_event(
"multiply node_creation",
paddle::platform::TracerEventType::OperatorInner,
1);

grad_node = std::shared_ptr<MultiplyGradNode>(new MultiplyGradNode(1, 2));
// Set for forward trace
if (FLAGS_check_nan_inf) {
grad_node->SetForwardTrace(egr::Controller::Instance().GetPythonStack());
}
// SetAttributes if needed
grad_node->SetAttributeaxis(-1);
// Set TensorWrappers for Forward Inputs if needed
auto x_clone = paddle::experimental::assign(x);
grad_node->SetTensorWrapperx(x_clone);
grad_node->SetTensorWrappery(y);
}

// Forward API Call
auto& api_result = paddle::experimental::multiply_(x, y);
// Check NaN and Inf if needed
@@ -275,10 +301,6 @@ paddle::Tensor& multiply__ad_func(paddle::Tensor& x, // NOLINT

// Get Output AutoGradMeta
egr::AutogradMeta* out_autograd_meta = egr::EagerUtils::autograd_meta(&out);
bool trace_backward = egr::Controller::Instance().HasGrad();
bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(
trace_backward, x_autograd_meta, y_autograd_meta);

// Check Inplace if needed

egr::EagerUtils::CheckInplace(x, x_autograd_meta, require_any_grad);
@@ -289,25 +311,7 @@ paddle::Tensor& multiply__ad_func(paddle::Tensor& x, // NOLINT

// Node Creation
if (require_any_grad) {
paddle::platform::RecordEvent node_creation_record_event(
"multiply node_creation",
paddle::platform::TracerEventType::OperatorInner,
1);

egr::EagerUtils::PassStopGradient(false, out_autograd_meta);

// Node Construction
auto grad_node =
std::shared_ptr<MultiplyGradNode>(new MultiplyGradNode(1, 2));
// Set for forward trace
if (FLAGS_check_nan_inf) {
grad_node->SetForwardTrace(egr::Controller::Instance().GetPythonStack());
}
// SetAttributes if needed
grad_node->SetAttributeaxis(-1);
// Set TensorWrappers for Forward Inputs if needed
grad_node->SetTensorWrapperx(x);
grad_node->SetTensorWrappery(y);
// SetGradOutMeta & SetEdges
grad_node->SetGradOutMeta(x, 0);
grad_node->SetGradOutMeta(y, 1);
@@ -429,7 +433,6 @@ paddle::Tensor multiply_ad_func(const paddle::Tensor& x,
input_str += input_y_str;
VLOG(3) << paddle::string::Sprintf(INPUT_PRINT_TEMPLATE, input_str);
}

// Forward API Call
auto api_result = paddle::experimental::sparse::multiply(x, y);
// Check NaN and Inf if needed
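
The reordering in this generated forward function moves grad-node creation, and the capture of x through an assign-created clone, ahead of the in-place forward call: the backward of multiply needs the original value of x, but multiply_ overwrites that buffer with the product. A rough dygraph sketch of the behavior this protects, using assumed values rather than the PR's own tests:

import paddle

x = paddle.to_tensor([1.0, 2.0, 3.0], stop_gradient=False)
y = paddle.to_tensor([4.0, 5.0, 6.0], stop_gradient=False)

out = x * 1.0           # leaf tensors that require grad cannot be modified in place
out = out.multiply_(y)  # the forward overwrites the intermediate's buffer with x * y
out.backward()

# d(out)/dy is the pre-inplace value of x, which the grad node wrapped as a clone.
print(y.grad)           # [1., 2., 3.]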
13 changes: 11 additions & 2 deletions paddle/phi/api/yaml/legacy_ops.yaml
@@ -125,13 +125,14 @@

- op : cast
args : (Tensor x, DataType dtype)
output : Tensor
output : Tensor(out)
infer_meta :
func : CastInferMeta
kernel :
func : cast
param : [x, dtype]
data_type : x
inplace: (x -> out)
backward : cast_grad

- op : channel_shuffle
@@ -202,11 +203,12 @@

- op : divide
args : (Tensor x, Tensor y)
output : Tensor
output : Tensor(out)
infer_meta :
func : ElementwiseInferMeta
kernel :
func : divide
inplace: (x -> out)
backward : divide_grad

- op : dropout
@@ -293,6 +295,7 @@
func : CompareInferMeta
kernel :
func : equal
inplace: (x -> out)

- op : exponential_
args : (Tensor x, float lam)
@@ -324,6 +327,7 @@
func : ElementwiseInferMeta
kernel :
func : floor_divide
inplace: (x -> out)

- op : frobenius_norm
args : (Tensor x, int64_t[] axis, bool keep_dim, bool reduce_all)
@@ -424,6 +428,7 @@
func : CompareInferMeta
kernel :
func : greater_equal
inplace: (x -> out)

- op : greater_than
args : (Tensor x, Tensor y)
@@ -432,6 +437,7 @@
func : CompareInferMeta
kernel :
func : greater_than
inplace: (x -> out)

- op : hardswish
args : (Tensor x)
@@ -470,6 +476,7 @@
func : CompareInferMeta
kernel :
func : less_equal
inplace: (x -> out)

- op : less_than
args : (Tensor x, Tensor y)
@@ -478,6 +485,7 @@
func : CompareInferMeta
kernel :
func : less_than
inplace: (x -> out)

- op : linspace
args : (Tensor start, Tensor stop, Tensor number, DataType dtype, Place place)
@@ -646,6 +654,7 @@
func : CompareInferMeta
kernel :
func : not_equal
inplace: (x -> out)

- op : one_hot
args : (Tensor x, Scalar(int) num_classes)
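
Each inplace: (x -> out) entry above marks the op as able to reuse x's storage for its output, which is what lets the API generator emit the corresponding trailing-underscore variants (divide_, floor_divide_, equal_, greater_than_, less_equal_, not_equal_, and so on). A small sketch of the resulting usage, with assumed values:

import paddle

a = paddle.to_tensor([7, 9, 11], dtype='int64')
b = paddle.to_tensor([2, 3, 4], dtype='int64')

a.floor_divide_(b)   # a now holds [3, 3, 2]; no separate output tensor is created
print(a)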
16 changes: 14 additions & 2 deletions paddle/phi/api/yaml/ops.yaml
@@ -330,6 +330,7 @@
kernel :
func : bitwise_and
backend : x
inplace: (x -> out)

- op : bitwise_not
args : (Tensor x)
@@ -339,6 +340,7 @@
kernel :
func : bitwise_not
backend : x
inplace: (x -> out)

- op : bitwise_or
args : (Tensor x, Tensor y)
@@ -348,6 +350,7 @@
kernel :
func : bitwise_or
backend : x
inplace: (x -> out)

- op : bitwise_xor
args : (Tensor x, Tensor y)
@@ -357,6 +360,7 @@
kernel :
func : bitwise_xor
backend : x
inplace: (x -> out)

- op : bmm
args : (Tensor x, Tensor y)
@@ -618,6 +622,7 @@
func : UnchangedInferMetaCheckAxis
kernel :
func : cumprod
inplace: (x -> out)
backward : cumprod_grad

- op : cumsum
@@ -628,6 +633,7 @@
kernel :
func : cumsum
data_type : x
inplace: (x -> out)
backward : cumsum_grad

- op : data
@@ -1524,6 +1530,7 @@
func : logical_and
data_type : x
backend : x
inplace: (x -> out)

- op : logical_not
args : (Tensor x)
@@ -1534,6 +1541,7 @@
func : logical_not
data_type : x
backend : x
inplace: (x -> out)

- op : logical_or
args : (Tensor x, Tensor y)
@@ -1544,6 +1552,7 @@
func : logical_or
data_type : x
backend : x
inplace: (x -> out)

- op : logical_xor
args : (Tensor x, Tensor y)
@@ -1554,6 +1563,7 @@
func : logical_xor
data_type : x
backend : x
inplace: (x -> out)

- op : logit
args : (Tensor x, float eps = 1e-6f)
@@ -2073,12 +2083,13 @@

- op : renorm
args : (Tensor x, float p, int axis, float max_norm)
output : Tensor
output : Tensor(out)
infer_meta :
func : UnchangedInferMeta
param : [x]
kernel :
func : renorm
inplace: (x -> out)
backward : renorm_grad

- op : reverse
Expand Down Expand Up @@ -2788,11 +2799,12 @@

- op : where
args : (Tensor condition, Tensor x, Tensor y)
output : Tensor
output : Tensor(out)
infer_meta :
func : WhereInferMeta
kernel :
func : where
inplace: (x -> out)
backward : where_grad

- op : yolo_box
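
The same pattern applies to the ops.yaml entries: bitwise_and/or/xor/not and logical_and/or/xor/not gain in-place variants (the yaml also marks cumprod, cumsum, renorm, and where as in-place capable). A hedged sketch of the bitwise and logical usage, with assumed values:

import paddle

a = paddle.to_tensor([12, 10], dtype='int32')   # 0b1100, 0b1010
b = paddle.to_tensor([10, 6], dtype='int32')    # 0b1010, 0b0110

a.bitwise_and_(b)   # a -> [8, 2]
a.bitwise_xor_(b)   # a -> [2, 4]

m = paddle.to_tensor([True, False])
n = paddle.to_tensor([True, True])
m.logical_xor_(n)   # m -> [False, True]
print(a, m)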
4 changes: 4 additions & 0 deletions paddle/phi/core/meta_tensor.cc
@@ -201,6 +201,10 @@ bool MetaTensor::is_selected_rows() const {
}
bool MetaTensor::is_tensor_array() const { return false; }

bool MetaTensor::is_same_tensor(const MetaTensor& meta_tensor) const {
return tensor_ != nullptr && tensor_ == meta_tensor.tensor();
}

void MetaTensor::share_dims(const MetaTensor& meta_tensor) {
ValidCheck(*this);
bool is_dense_tensor = phi::DenseTensor::classof(tensor_);
2 changes: 2 additions & 0 deletions paddle/phi/core/meta_tensor.h
@@ -86,6 +86,8 @@ class MetaTensor {
// and it will be deleted in the future.
virtual bool is_tensor_array() const;

virtual bool is_same_tensor(const MetaTensor& meta_tensor) const;

virtual operator unspecified_bool_type() const {
return tensor_ == nullptr ? 0 : unspecified_bool_true;
}
5 changes: 3 additions & 2 deletions paddle/phi/infermeta/binary.cc
@@ -380,8 +380,9 @@ void CompareRawInferMeta(const MetaTensor& x,
out->set_dims(make_ddim(out_dims_array));
out->share_lod(x);
}

out->set_dtype(DataType::BOOL);
if (!out->is_same_tensor(x)) {
out->set_dtype(DataType::BOOL);
}
}

void CompareInferMeta(const MetaTensor& x,
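
Because CompareRawInferMeta now skips forcing the output dtype to BOOL when out is the same tensor as x, an in-place comparison is presumably expected to write 0/1 results back into x without changing x's dtype, while out-of-place comparisons keep returning bool. A hedged illustration of that reading (values assumed):

import paddle

x = paddle.to_tensor([1.0, 2.0, 3.0])
y = paddle.to_tensor([1.0, 0.0, 3.0])

print(paddle.not_equal(x, y).dtype)  # paddle.bool, the out-of-place path is unchanged

x.not_equal_(y)   # in-place: comparison results are written into x's own buffer
print(x)          # [0., 1., 0.]
print(x.dtype)    # expected to remain paddle.float32 rather than become bool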
7 changes: 6 additions & 1 deletion paddle/phi/infermeta/unary.cc
@@ -384,9 +384,14 @@ void BatchSizeLikeInferMeta(const MetaTensor& x,

void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out) {
out->set_dims(x.dims());
out->set_dtype(out_dtype);
out->set_layout(x.layout());
out->share_lod(x);
// In the inplace case, setting the dtype of out will reset the dtype of x at the
// same time, which will cause bugs, so move the dtype setting of out to the
// kernel
if (!(out->is_same_tensor(x))) {
out->set_dtype(out_dtype);
}
}

void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out) {
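
The guard above matters because, in the in-place case, out and x are the same underlying tensor, so setting out's dtype during InferMeta would retag x before the kernel has converted any data; the dtype switch is therefore deferred to the cast kernel itself. A sketch of the in-place cast this supports, assuming it is exposed as Tensor.cast_ (the Python entry point is not shown in this diff):

import paddle

x = paddle.ones([2, 2], dtype='float32')
x.cast_('int32')   # assumed API: same Tensor object, data converted and dtype retagged in the kernel
print(x.dtype)     # expected: paddle.int32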
3 changes: 2 additions & 1 deletion paddle/phi/kernels/cpu/cast_grad_kernel.cc
@@ -26,7 +26,8 @@ void CastGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
DenseTensor* x_grad) {
PD_VISIT_ALL_TYPES(x.dtype(), "CastKernelImpl", ([&] {
CastKernelImpl<T, data_t>(dev_ctx, out_grad, x_grad);
CastKernelImpl<T, data_t>(
dev_ctx, out_grad, x_grad->dtype(), x_grad);
}));
}

23 changes: 23 additions & 0 deletions paddle/phi/kernels/cpu/cast_impl.h
@@ -29,12 +29,35 @@ struct CastOpTransformFunctor {
template <typename InT, typename OutT>
void CastKernelImpl(const CPUContext& dev_ctx,
const DenseTensor& x,
DataType out_dtype,
DenseTensor* out) {
auto* in_begin = x.data<InT>();
auto numel = x.numel();
auto* in_end = in_begin + numel;

auto* out_begin = dev_ctx.Alloc<OutT>(out);
out->set_type(out_dtype);

phi::Transform<CPUContext> trans;
trans(dev_ctx,
in_begin,
in_end,
out_begin,
CastOpTransformFunctor<InT, OutT>());
}

template <typename InT, typename OutT>
void CastInplaceKernelImpl(const CPUContext& dev_ctx,
const DenseTensor& x,
DataType out_dtype,
DenseTensor* out) {
auto x_origin = x;
auto* in_begin = x_origin.data<InT>();
auto numel = x_origin.numel();
auto* in_end = in_begin + numel;

auto* out_begin = dev_ctx.Alloc<OutT>(out);
out->set_type(out_dtype);

phi::Transform<CPUContext> trans;
trans(dev_ctx,
13 changes: 10 additions & 3 deletions paddle/phi/kernels/cpu/cast_kernel.cc
@@ -25,9 +25,16 @@ void CastKernel(const Context& dev_ctx,
const DenseTensor& x,
DataType out_dtype,
DenseTensor* out) {
PD_VISIT_ALL_TYPES(out_dtype, "CastKernelImpl", ([&] {
CastKernelImpl<T, data_t>(dev_ctx, x, out);
}));
if (out->IsSharedWith(x)) {
PD_VISIT_ALL_TYPES(out_dtype, "CastInplaceKernelImpl", ([&] {
CastInplaceKernelImpl<T, data_t>(
dev_ctx, x, out_dtype, out);
}));
} else {
PD_VISIT_ALL_TYPES(out_dtype, "CastKernelImpl", ([&] {
CastKernelImpl<T, data_t>(dev_ctx, x, out_dtype, out);
}));
}
}

} // namespace phi