From 239b830f9939ca706d8b0e38a502d81ede3572cf Mon Sep 17 00:00:00 2001 From: Zhan Rongrui <46243324+zrr1999@users.noreply.github.com> Date: Thu, 29 Feb 2024 14:11:03 +0800 Subject: [PATCH 01/55] =?UTF-8?q?[PIR]=20A-20=E3=80=81B-9=E3=80=81B-10=20A?= =?UTF-8?q?dapt=20test=5Ferrors=20(#62118)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/legacy_test/test_activation_op.py | 39 ++++++++++++++++---------- test/legacy_test/test_full_like_op.py | 6 ++-- 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py index deecf7fd09a9e..45c79e6aba5c9 100644 --- a/test/legacy_test/test_activation_op.py +++ b/test/legacy_test/test_activation_op.py @@ -40,9 +40,12 @@ def dynamic_guard(): class TestSqrtOpError(unittest.TestCase): + @test_with_pir_api def test_errors(self): with static_guard(): - with program_guard(Program(), Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): # The input type of sqrt op must be Variable or numpy.ndarray. in1 = 1 self.assertRaises(TypeError, paddle.sqrt, in1) @@ -643,6 +646,7 @@ def test_dygraph_api(self): np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05) paddle.enable_static() + @test_with_pir_api def test_errors(self): with static_guard(): with paddle.static.program_guard(paddle.static.Program()): @@ -890,6 +894,7 @@ def test_dygraph_api(self): for r in [out1, out2, out3]: np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05) + @test_with_pir_api def test_errors(self): with static_guard(): with paddle.static.program_guard(paddle.static.Program()): @@ -2702,22 +2707,24 @@ def test_dygraph_api(self): for r in [out1, out2]: np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05) + @test_with_pir_api def test_errors(self): with static_guard(): - with static_guard(): - with paddle.static.program_guard(paddle.static.Program()): - # The input type must be Variable. - self.assertRaises(TypeError, self.relu, 1) - # The input dtype must be float16, float32, float64. - x_int32 = paddle.static.data( - name='x_int32', shape=[10, 12], dtype='int32' - ) - self.assertRaises(TypeError, self.relu, x_int32) - # support the input dtype is float16 - x_fp16 = paddle.static.data( - name='x_fp16', shape=[10, 12], dtype='float16' - ) - self.relu(x_fp16) + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + # The input type must be Variable. + self.assertRaises(TypeError, self.relu, 1) + # The input dtype must be float16, float32, float64. 
+ x_int32 = paddle.static.data( + name='x_int32', shape=[10, 12], dtype='int32' + ) + self.assertRaises(TypeError, self.relu, x_int32) + # support the input dtype is float16 + x_fp16 = paddle.static.data( + name='x_fp16', shape=[10, 12], dtype='float16' + ) + self.relu(x_fp16) class TestReluInplaceAPI(TestReluAPI): @@ -2846,6 +2853,7 @@ def test_dygraph_api(self): for r in [out1, out2]: np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05) + @test_with_pir_api def test_errors(self): with static_guard(): with paddle.static.program_guard(paddle.static.Program()): @@ -3029,6 +3037,7 @@ def test_dygraph_api(self): for r in [out1, out2]: np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05) + @test_with_pir_api def test_errors(self): with static_guard(): with paddle.static.program_guard(paddle.static.Program()): diff --git a/test/legacy_test/test_full_like_op.py b/test/legacy_test/test_full_like_op.py index 9f327b0b0107a..81322bd431c31 100644 --- a/test/legacy_test/test_full_like_op.py +++ b/test/legacy_test/test_full_like_op.py @@ -23,7 +23,6 @@ from paddle.base.framework import convert_np_dtype_to_dtype_ from paddle.framework import in_pir_mode from paddle.pir_utils import test_with_pir_api -from paddle.static import Program, program_guard def fill_any_like_wrapper(x, value, out_dtype=None, name=None): @@ -98,8 +97,11 @@ def test_full_like_fill_inf(self): class TestFullOpError(unittest.TestCase): + @test_with_pir_api def test_errors(self): - with program_guard(Program(), Program()): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): # for ci coverage input_data = paddle.static.data( From 73f9671b168fc8f01480e7886bd5dbc98f54cff2 Mon Sep 17 00:00:00 2001 From: Yuanle Liu Date: Thu, 29 Feb 2024 14:23:57 +0800 Subject: [PATCH 02/55] [Inference] Export pir&pass headers for inference lib (#61863) * export pir&pass headers in inference * fix * final --- cmake/cuda.cmake | 2 +- ...eader.cmake => export_paddle_header.cmake} | 46 +++++++++++++----- cmake/inference_lib.cmake | 48 +++++++++++++++++-- paddle/cinn/hlir/framework/pir/op_mapper.h | 3 ++ paddle/extension.h | 23 +++++++++ .../inference/api/demo_ci/CMakeLists.txt | 2 +- .../fluid/pir/dialect/kernel/ir/kernel_op.cc | 4 +- paddle/fluid/pir/drr/src/pattern_graph.cc | 4 +- paddle/fluid/pir/drr/src/pattern_graph.h | 2 +- paddle/fluid/pir/drr/src/rewrite_pattern.cc | 7 +-- paddle/phi/api/all.h | 5 -- paddle/pir/include/core/block_argument.h | 1 + .../pir/include/core/builtin_type_storage.h | 2 + paddle/pir/include/core/interface_support.h | 3 +- paddle/pir/include/core/interface_value.h | 2 + paddle/pir/include/core/ir_context.h | 1 + paddle/pir/include/core/ir_mapping.h | 2 + paddle/pir/include/core/iterator.h | 3 ++ paddle/pir/include/core/op_base.h | 1 + paddle/pir/include/core/op_info.h | 1 + paddle/pir/include/core/op_operand.h | 1 + paddle/pir/include/core/op_result.h | 1 + paddle/pir/include/core/operation_utils.h | 1 + paddle/pir/include/core/parameter.h | 2 + .../include/core/storage_manager_support.h | 1 + paddle/pir/include/core/type.h | 1 + paddle/pir/include/core/type_id.h | 1 - paddle/pir/include/core/visitors.h | 1 + .../include/dialect/control_flow/ir/cf_op.h | 2 + .../pir/include/dialect/shape/ir/shape_op.h | 1 + paddle/pir/include/pass/pass.h | 8 +--- paddle/pir/src/core/block.cc | 1 + paddle/pir/src/core/block_argument.cc | 2 + paddle/pir/src/core/builder.cc | 2 + paddle/pir/src/core/builtin_op.cc | 4 +- paddle/pir/src/core/dialect.cc | 2 + paddle/pir/src/core/ir_context.cc | 1 + 
paddle/pir/src/core/op_info_impl.cc | 4 +- paddle/pir/src/core/op_result_impl.cc | 4 +- paddle/pir/src/core/op_trait.cc | 4 +- paddle/pir/src/core/operation.cc | 1 + paddle/pir/src/core/storage_manager.cc | 1 + paddle/pir/src/core/value_impl.cc | 2 + .../pir/src/dialect/control_flow/ir/cf_op.cc | 4 +- paddle/pir/src/pass/print_statistics.cc | 2 + .../pattern_rewrite/pattern_rewrite_driver.cc | 1 + .../utils/cpp_extension/cpp_extension.py | 2 +- .../utils/cpp_extension/extension_utils.py | 6 +-- python/setup.py.in | 8 +++- setup.py | 21 ++++++++ test/cpp/pir/tools/test_op.h | 2 + 51 files changed, 208 insertions(+), 48 deletions(-) rename cmake/{phi_header.cmake => export_paddle_header.cmake} (52%) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 81a7228629d25..e0a2a7eb34739 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -294,7 +294,7 @@ select_nvcc_arch_flags(NVCC_FLAGS_EXTRA NVCC_ARCH_BIN) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA}") message(STATUS "NVCC_FLAGS_EXTRA: ${NVCC_FLAGS_EXTRA}") -# Set C++14 support +# Set C++17 support set(CUDA_PROPAGATE_HOST_FLAGS OFF) # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. # So, don't set these flags here. diff --git a/cmake/phi_header.cmake b/cmake/export_paddle_header.cmake similarity index 52% rename from cmake/phi_header.cmake rename to cmake/export_paddle_header.cmake index ac633b747bcef..9b139da98ad2d 100644 --- a/cmake/phi_header.cmake +++ b/cmake/export_paddle_header.cmake @@ -15,33 +15,57 @@ set(PADDLE_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_install_dir") -function(phi_header_path_compat TARGET_PATH) - message(STATUS "phi header path compat processing: ${TARGET_PATH}") +function(header_path_compat TARGET_PATH) + message(STATUS "header path compat processing: ${TARGET_PATH}") file(GLOB HEADERS "${TARGET_PATH}/*" "*.h") foreach(header ${HEADERS}) if(${header} MATCHES ".*.h$") file(READ ${header} HEADER_CONTENT) string(REPLACE "paddle/fluid/platform/" "paddle/phi/" HEADER_CONTENT "${HEADER_CONTENT}") + string(REPLACE "paddle/pir/include/" "paddle/pir/" HEADER_CONTENT + "${HEADER_CONTENT}") + string(REPLACE "paddle/fluid/pir/drr/include/" "paddle/pir/drr/" + HEADER_CONTENT "${HEADER_CONTENT}") + string(REPLACE "paddle/fluid/pir/transforms/" "paddle/pir/transforms/" + HEADER_CONTENT "${HEADER_CONTENT}") file(WRITE ${header} "${HEADER_CONTENT}") - message(STATUS "phi header path compat processing complete: ${header}") + message(STATUS "header path compat processing complete: ${header}") endif() endforeach() endfunction() -phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle) -phi_header_path_compat( - ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/phi) -phi_header_path_compat( +header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle) +header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/phi) +header_path_compat( ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/phi/api) -phi_header_path_compat( +header_path_compat( ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/phi/api/ext) -phi_header_path_compat( +header_path_compat( ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/phi/api/include) -phi_header_path_compat( +header_path_compat( ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/phi/common) -phi_header_path_compat( +header_path_compat( ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/phi/core) +header_path_compat( + ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/core) 
+header_path_compat( + ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/core/parser) +header_path_compat( + ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/dialect/control_flow/ir +) +header_path_compat( + ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/dialect/shape/ir) +header_path_compat( + ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/dialect/shape/utils) +header_path_compat( + ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/drr) +header_path_compat( + ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/pass) +header_path_compat( + ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/pattern_rewrite) +header_path_compat( + ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/transforms) # NOTE(liuyuanle): In inference lib, no need include paddle/utils/pybind.h, so we delete this. file(READ ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/extension.h diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index f4a8286985094..7db3a7de046fd 100755 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -354,12 +354,54 @@ copy( SRCS ${PADDLE_SOURCE_DIR}/paddle/extension.h DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/) -# the include path of phi needs to be changed to adapt to inference api path +copy( + inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/pir/include/core/parser/*.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/core/parser/) +copy( + inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/pir/include/core/*.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/core/) +copy( + inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/pir/include/dialect/control_flow/ir/*.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/dialect/control_flow/ir/ +) +copy( + inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/pir/include/dialect/shape/ir/*.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/dialect/shape/ir/ +) +copy( + inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/pir/include/dialect/shape/utils/*.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/dialect/shape/utils/ +) +copy( + inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/pir/include/pass/*.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/pass/) +copy( + inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/pir/include/pattern_rewrite/*.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/pattern_rewrite/ +) +copy( + inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/pir/drr/include/*.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/drr/) +copy( + inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/pir/transforms/transform_general_functions.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/transforms/) + +# the include path of paddle needs to be changed to adapt to inference api path add_custom_command( TARGET inference_lib_dist POST_BUILD - COMMAND ${CMAKE_COMMAND} -P "${PADDLE_SOURCE_DIR}/cmake/phi_header.cmake" - COMMENT "Change phi header include path to adapt to inference api path") + COMMAND ${CMAKE_COMMAND} -P + "${PADDLE_SOURCE_DIR}/cmake/export_paddle_header.cmake" + COMMENT "Change paddle header include path to adapt to inference api path") # CAPI inference library for only inference set(PADDLE_INFERENCE_C_INSTALL_DIR diff --git a/paddle/cinn/hlir/framework/pir/op_mapper.h b/paddle/cinn/hlir/framework/pir/op_mapper.h index 
73e8d9581e4b0..87053a8c02d53 100644 --- a/paddle/cinn/hlir/framework/pir/op_mapper.h +++ b/paddle/cinn/hlir/framework/pir/op_mapper.h @@ -13,9 +13,12 @@ // limitations under the License. #pragma once + +#include #include #include #include + #include "paddle/cinn/utils/type_defs.h" #include "paddle/pir/include/core/operation.h" diff --git a/paddle/extension.h b/paddle/extension.h index 3c79adcde5d69..f3c6e0a1b15f9 100644 --- a/paddle/extension.h +++ b/paddle/extension.h @@ -14,12 +14,35 @@ limitations under the License. */ #pragma once +#if defined(__clang__) || defined(__GNUC__) +#define CPP_STANDARD __cplusplus +#elif defined(_MSC_VER) +#define CPP_STANDARD _MSVC_LANG +#endif + #ifndef CUSTOM_OP_WITH_SPMD #define CUSTOM_OP_WITH_SPMD #endif // All paddle apis in C++ frontend +// phi headers #include "paddle/phi/api/all.h" +// common headers +#include "paddle/common/ddim.h" +#include "paddle/common/exception.h" +#include "paddle/common/layout.h" + +#if CPP_STANDARD >= 201703L && !defined(__clang__) +// pir&pass headers +#include "paddle/fluid/pir/drr/include/drr_pattern_base.h" +#include "paddle/pir/include/core/operation.h" +#include "paddle/pir/include/core/type.h" +#include "paddle/pir/include/core/value.h" +#include "paddle/pir/include/pass/pass.h" +#include "paddle/pir/include/pass/pass_manager.h" +#include "paddle/pir/include/pattern_rewrite/pattern_match.h" +#endif + #if !defined(PADDLE_ON_INFERENCE) && !defined(PADDLE_NO_PYTHON) // Python bindings for the C++ frontend (includes Python.h) #include "paddle/utils/pybind.h" diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 727af4e00605e..1206ac1fd6859 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -85,7 +85,7 @@ else() if(WITH_MKL) set(FLAG_OPENMP "-fopenmp") endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 ${FLAG_OPENMP}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 ${FLAG_OPENMP}") endif() if(WITH_GPU) diff --git a/paddle/fluid/pir/dialect/kernel/ir/kernel_op.cc b/paddle/fluid/pir/dialect/kernel/ir/kernel_op.cc index 0c8f007a51a9d..c3e44d4e3ef35 100644 --- a/paddle/fluid/pir/dialect/kernel/ir/kernel_op.cc +++ b/paddle/fluid/pir/dialect/kernel/ir/kernel_op.cc @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/pir/dialect/kernel/ir/kernel_op.h" +#include + #include "paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_op.h" #include "paddle/phi/core/enforce.h" #include "paddle/pir/include/core/builtin_attribute.h" diff --git a/paddle/fluid/pir/drr/src/pattern_graph.cc b/paddle/fluid/pir/drr/src/pattern_graph.cc index a8c72a064d0b8..eccbb30dea890 100644 --- a/paddle/fluid/pir/drr/src/pattern_graph.cc +++ b/paddle/fluid/pir/drr/src/pattern_graph.cc @@ -147,7 +147,7 @@ void GraphTopo::WalkGraphNodesTopoOrder( const std::unordered_set &inputs_tensor = graph_->input_tensors(); const std::unordered_map> - &id2owned_tensor = graph_->id2owend_tensor(); + &id2owned_tensor = graph_->id2owned_tensor(); const std::vector> &owend_opcall = graph_->owned_op_call(); @@ -202,7 +202,7 @@ void GraphTopo::WalkGraphNodesTopoOrder( std::ostream &operator<<(std::ostream &os, const PatternGraph &pattern_graph) { os << "\nAll Tensors:\n"; - for (const auto &kv : pattern_graph.id2owend_tensor()) { + for (const auto &kv : pattern_graph.id2owned_tensor()) { os << " " << kv.first; } os << "\n\n"; diff --git a/paddle/fluid/pir/drr/src/pattern_graph.h b/paddle/fluid/pir/drr/src/pattern_graph.h index e5cd74b2fa217..7243c99bfc853 100644 --- a/paddle/fluid/pir/drr/src/pattern_graph.h +++ b/paddle/fluid/pir/drr/src/pattern_graph.h @@ -57,7 +57,7 @@ class PatternGraph { } const std::unordered_map>& - id2owend_tensor() const { + id2owned_tensor() const { return id2owned_tensor_; } diff --git a/paddle/fluid/pir/drr/src/rewrite_pattern.cc b/paddle/fluid/pir/drr/src/rewrite_pattern.cc index 68a7b14f81a3e..04390126ddddf 100644 --- a/paddle/fluid/pir/drr/src/rewrite_pattern.cc +++ b/paddle/fluid/pir/drr/src/rewrite_pattern.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include "paddle/fluid/pir/drr/include/drr_pattern_base.h" @@ -414,13 +415,13 @@ MatchContextImpl DrrRewritePattern::CreateOperations( // add input tensors info for res_match_ctx for (const auto& in_tensor : result_pattern_graph.input_tensors()) { PADDLE_ENFORCE_NE( - result_pattern_graph.id2owend_tensor().count(in_tensor), + result_pattern_graph.id2owned_tensor().count(in_tensor), 0, phi::errors::NotFound("Not found the input tensor." "Drr input tensor [%s] must exist in the result " "pattern graph to be obtained.", in_tensor)); - if (!result_pattern_graph.id2owend_tensor().at(in_tensor)->is_none()) { + if (!result_pattern_graph.id2owned_tensor().at(in_tensor)->is_none()) { res_match_ctx.BindIrValue(in_tensor, src_match_ctx.GetIrValue(in_tensor)); } } @@ -508,7 +509,7 @@ void DrrRewritePattern::ReplaceOutputTensor( const MatchContextImpl& res_match_ctx, pir::PatternRewriter& rewriter) const { // NOLINT for (const auto& output_name : result_pattern_graph_->output_tensors()) { - if (source_pattern_graph_->id2owend_tensor().count(output_name)) { + if (source_pattern_graph_->id2owned_tensor().count(output_name)) { const auto& src_ir_tensor = src_match_ctx.GetIrValue(output_name); const auto& res_ir_tensor = res_match_ctx.GetIrValue(output_name); rewriter.ReplaceAllUsesWith(src_ir_tensor, res_ir_tensor); diff --git a/paddle/phi/api/all.h b/paddle/phi/api/all.h index 93c97605f9f3f..aaafec306401a 100644 --- a/paddle/phi/api/all.h +++ b/paddle/phi/api/all.h @@ -38,8 +38,3 @@ limitations under the License. 
*/ #include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/api/ext/tensor_compat.h" - -// common headers -#include "paddle/common/ddim.h" -#include "paddle/common/exception.h" -#include "paddle/common/layout.h" diff --git a/paddle/pir/include/core/block_argument.h b/paddle/pir/include/core/block_argument.h index 3ddf7847fd8a2..b3b8c78660c34 100644 --- a/paddle/pir/include/core/block_argument.h +++ b/paddle/pir/include/core/block_argument.h @@ -16,6 +16,7 @@ #include "paddle/pir/include/core/operation_utils.h" #include "paddle/pir/include/core/value.h" + namespace pir { class Block; diff --git a/paddle/pir/include/core/builtin_type_storage.h b/paddle/pir/include/core/builtin_type_storage.h index 03f06279a0dfd..f706e0c66277e 100644 --- a/paddle/pir/include/core/builtin_type_storage.h +++ b/paddle/pir/include/core/builtin_type_storage.h @@ -14,6 +14,8 @@ #pragma once +#include + #include "paddle/common/ddim.h" #include "paddle/common/dim.h" #include "paddle/common/hash_funcs.h" diff --git a/paddle/pir/include/core/interface_support.h b/paddle/pir/include/core/interface_support.h index a035114e44bf2..12d419b3291c6 100644 --- a/paddle/pir/include/core/interface_support.h +++ b/paddle/pir/include/core/interface_support.h @@ -19,6 +19,7 @@ namespace pir { namespace detail { + template class ConstructInterfacesOrTraits { public: @@ -45,14 +46,12 @@ class ConstructInterfacesOrTraits { IR_ENFORCE(suceess, "Interface: id[%u] is already registered. inset failed", TypeId::get()); - VLOG(10) << "New a interface: id[" << TypeId::get() << "]."; } /// Placement new trait. template static void PlacementConstrctTrait(pir::TypeId *&p_trait) { // NOLINT *p_trait = TypeId::get(); - VLOG(10) << "New a trait: id[" << *p_trait << "]."; ++p_trait; } }; diff --git a/paddle/pir/include/core/interface_value.h b/paddle/pir/include/core/interface_value.h index 00f8cc289143f..64619a0e0f591 100644 --- a/paddle/pir/include/core/interface_value.h +++ b/paddle/pir/include/core/interface_value.h @@ -13,8 +13,10 @@ // limitations under the License. #pragma once + #include #include + #include "paddle/pir/include/core/type_id.h" #include "paddle/pir/include/core/utils.h" diff --git a/paddle/pir/include/core/ir_context.h b/paddle/pir/include/core/ir_context.h index dbf7ff4cdd73e..914fecc60a056 100644 --- a/paddle/pir/include/core/ir_context.h +++ b/paddle/pir/include/core/ir_context.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include #include #include diff --git a/paddle/pir/include/core/ir_mapping.h b/paddle/pir/include/core/ir_mapping.h index 83994ea284570..e67c507059b17 100644 --- a/paddle/pir/include/core/ir_mapping.h +++ b/paddle/pir/include/core/ir_mapping.h @@ -12,7 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. #pragma once + #include + #include "paddle/common/enforce.h" #include "paddle/pir/include/core/value.h" diff --git a/paddle/pir/include/core/iterator.h b/paddle/pir/include/core/iterator.h index 8fbfae8cb4b2d..fc88d981c3661 100644 --- a/paddle/pir/include/core/iterator.h +++ b/paddle/pir/include/core/iterator.h @@ -13,9 +13,12 @@ // limitations under the License. 
#pragma once + #include #include + #include "paddle/common/macros.h" + namespace pir { class Operation; diff --git a/paddle/pir/include/core/op_base.h b/paddle/pir/include/core/op_base.h index 93e6939be8adf..698f65c791dbe 100644 --- a/paddle/pir/include/core/op_base.h +++ b/paddle/pir/include/core/op_base.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include #include "paddle/common/enforce.h" diff --git a/paddle/pir/include/core/op_info.h b/paddle/pir/include/core/op_info.h index fbeb679463a4d..124ed660db0f4 100644 --- a/paddle/pir/include/core/op_info.h +++ b/paddle/pir/include/core/op_info.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include #include diff --git a/paddle/pir/include/core/op_operand.h b/paddle/pir/include/core/op_operand.h index 5366ab390ffa0..4944c31fdb283 100644 --- a/paddle/pir/include/core/op_operand.h +++ b/paddle/pir/include/core/op_operand.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include #include "paddle/pir/include/core/dll_decl.h" diff --git a/paddle/pir/include/core/op_result.h b/paddle/pir/include/core/op_result.h index 04ae0e848e511..58af7c1a81e97 100644 --- a/paddle/pir/include/core/op_result.h +++ b/paddle/pir/include/core/op_result.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/pir/include/core/value.h" + namespace pir { namespace detail { diff --git a/paddle/pir/include/core/operation_utils.h b/paddle/pir/include/core/operation_utils.h index 4360af17e08a4..891f109eaa8a2 100644 --- a/paddle/pir/include/core/operation_utils.h +++ b/paddle/pir/include/core/operation_utils.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/pir/include/core/attribute.h" #include "paddle/pir/include/core/dll_decl.h" #include "paddle/pir/include/core/op_info.h" diff --git a/paddle/pir/include/core/parameter.h b/paddle/pir/include/core/parameter.h index cad6839ea8851..bfcbe17b3289c 100644 --- a/paddle/pir/include/core/parameter.h +++ b/paddle/pir/include/core/parameter.h @@ -14,6 +14,8 @@ #pragma once +#include + #include "paddle/pir/include/core/type.h" namespace pir { diff --git a/paddle/pir/include/core/storage_manager_support.h b/paddle/pir/include/core/storage_manager_support.h index 9952d2d144d66..7d4d540382dcd 100644 --- a/paddle/pir/include/core/storage_manager_support.h +++ b/paddle/pir/include/core/storage_manager_support.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/pir/include/core/interface_support.h" #include "paddle/pir/include/core/ir_context.h" #include "paddle/pir/include/core/type.h" diff --git a/paddle/pir/include/core/type.h b/paddle/pir/include/core/type.h index 98ef867bef49b..569b356135b18 100644 --- a/paddle/pir/include/core/type.h +++ b/paddle/pir/include/core/type.h @@ -19,6 +19,7 @@ #include "paddle/pir/include/core/cast_utils.h" #include "paddle/pir/include/core/storage_manager_support.h" #include "paddle/pir/include/core/type_id.h" + namespace pir { class TypeStorage; class AbstractType; diff --git a/paddle/pir/include/core/type_id.h b/paddle/pir/include/core/type_id.h index b6e107c777559..2bce5d92752d2 100644 --- a/paddle/pir/include/core/type_id.h +++ b/paddle/pir/include/core/type_id.h @@ -14,7 +14,6 @@ #pragma once -#include #include #include "paddle/pir/include/core/dll_decl.h" diff --git a/paddle/pir/include/core/visitors.h b/paddle/pir/include/core/visitors.h index c2cf137e44624..31f0262865127 100644 --- a/paddle/pir/include/core/visitors.h +++ b/paddle/pir/include/core/visitors.h @@ -14,6 +14,7 @@ #pragma once #include + #include 
"paddle/pir/include/core/dll_decl.h" namespace pir { diff --git a/paddle/pir/include/dialect/control_flow/ir/cf_op.h b/paddle/pir/include/dialect/control_flow/ir/cf_op.h index 0d6e60a017ab3..e01dec38ce73c 100644 --- a/paddle/pir/include/dialect/control_flow/ir/cf_op.h +++ b/paddle/pir/include/dialect/control_flow/ir/cf_op.h @@ -13,7 +13,9 @@ // limitations under the License. #pragma once + #include + #include "paddle/pir/include/core/builder.h" #include "paddle/pir/include/core/op_base.h" #include "paddle/pir/include/core/op_trait.h" diff --git a/paddle/pir/include/dialect/shape/ir/shape_op.h b/paddle/pir/include/dialect/shape/ir/shape_op.h index 84440d64abc43..3bc7562eaf0e4 100644 --- a/paddle/pir/include/dialect/shape/ir/shape_op.h +++ b/paddle/pir/include/dialect/shape/ir/shape_op.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/pir/include/core/builder.h" #include "paddle/pir/include/core/builtin_type_interfaces.h" #include "paddle/pir/include/core/ir_printer.h" diff --git a/paddle/pir/include/pass/pass.h b/paddle/pir/include/pass/pass.h index 3be04b71051f7..bdd530782c034 100644 --- a/paddle/pir/include/pass/pass.h +++ b/paddle/pir/include/pass/pass.h @@ -136,23 +136,17 @@ class IR_API Pass { // Set a pointer to the attribute. Pass takes ownership of the attribute. template void Set(const std::string& attr_name, AttrType* attr) { - VLOG(3) << "Setting the attribute " << attr_name << " for the pass " - << name(); if (Has(attr_name)) { Erase(attr_name); } attrs_[attr_name] = attr; - attr_dels_[attr_name] = [attr, attr_name]() { - VLOG(8) << "deleting " << attr_name; - delete attr; - }; + attr_dels_[attr_name] = [attr, attr_name]() { delete attr; }; } // Set a pointer to the attribute. Pass doesn't take ownership. Caller // should delete the attribute. template void SetNotOwned(const std::string& attr_name, AttrType* attr) { - VLOG(3) << "Setting the attribute " << attr_name << " for the " << name(); IR_ENFORCE( !Has(attr_name), "Attribute %s already set in the pass.", attr_name); attrs_[attr_name] = attr; diff --git a/paddle/pir/src/core/block.cc b/paddle/pir/src/core/block.cc index 258f681b303cb..39b347dfe81b4 100644 --- a/paddle/pir/src/core/block.cc +++ b/paddle/pir/src/core/block.cc @@ -14,6 +14,7 @@ #include "paddle/pir/include/core/block.h" +#include #include #include "paddle/common/enforce.h" diff --git a/paddle/pir/src/core/block_argument.cc b/paddle/pir/src/core/block_argument.cc index 99a799e9f592e..1966aa191476a 100644 --- a/paddle/pir/src/core/block_argument.cc +++ b/paddle/pir/src/core/block_argument.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include + #include "paddle/pir/include/core/block_argument.h" #include "paddle/pir/include/core/builtin_attribute.h" #include "paddle/pir/include/core/operation_utils.h" diff --git a/paddle/pir/src/core/builder.cc b/paddle/pir/src/core/builder.cc index 80147428922ba..2b6d000b8639e 100644 --- a/paddle/pir/src/core/builder.cc +++ b/paddle/pir/src/core/builder.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include + #include "paddle/pir/include/core/builder.h" #include "paddle/pir/include/core/builtin_attribute.h" #include "paddle/pir/include/core/builtin_type.h" diff --git a/paddle/pir/src/core/builtin_op.cc b/paddle/pir/src/core/builtin_op.cc index 24b7624dafc63..fca2ebe63eea5 100644 --- a/paddle/pir/src/core/builtin_op.cc +++ b/paddle/pir/src/core/builtin_op.cc @@ -12,9 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/pir/include/core/builtin_op.h" +#include + #include "paddle/common/enforce.h" #include "paddle/pir/include/core/builtin_attribute.h" +#include "paddle/pir/include/core/builtin_op.h" #include "paddle/pir/include/core/builtin_type.h" namespace pir { diff --git a/paddle/pir/src/core/dialect.cc b/paddle/pir/src/core/dialect.cc index b09709da6b0db..668c56111d0ac 100644 --- a/paddle/pir/src/core/dialect.cc +++ b/paddle/pir/src/core/dialect.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include + #include "paddle/pir/include/core/dialect.h" namespace pir { diff --git a/paddle/pir/src/core/ir_context.cc b/paddle/pir/src/core/ir_context.cc index a4839bb2d4a34..90393fe4370b9 100644 --- a/paddle/pir/src/core/ir_context.cc +++ b/paddle/pir/src/core/ir_context.cc @@ -14,6 +14,7 @@ #include "paddle/pir/include/core/ir_context.h" +#include #include #include "paddle/pir/include/core/attribute_base.h" diff --git a/paddle/pir/src/core/op_info_impl.cc b/paddle/pir/src/core/op_info_impl.cc index efbcedf42cc0f..f9d5295671113 100644 --- a/paddle/pir/src/core/op_info_impl.cc +++ b/paddle/pir/src/core/op_info_impl.cc @@ -12,9 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/pir/src/core/op_info_impl.h" +#include + #include "paddle/pir/include/core/dialect.h" #include "paddle/pir/include/core/interface_support.h" +#include "paddle/pir/src/core/op_info_impl.h" namespace pir { diff --git a/paddle/pir/src/core/op_result_impl.cc b/paddle/pir/src/core/op_result_impl.cc index 3bc9e5023b3b2..dd895cc04d10d 100644 --- a/paddle/pir/src/core/op_result_impl.cc +++ b/paddle/pir/src/core/op_result_impl.cc @@ -12,9 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/pir/src/core/op_result_impl.h" +#include + #include "paddle/pir/include/core/builtin_attribute.h" #include "paddle/pir/include/core/operation.h" +#include "paddle/pir/src/core/op_result_impl.h" namespace pir { namespace detail { diff --git a/paddle/pir/src/core/op_trait.cc b/paddle/pir/src/core/op_trait.cc index 4261dbcc8a457..39a0f6001da18 100644 --- a/paddle/pir/src/core/op_trait.cc +++ b/paddle/pir/src/core/op_trait.cc @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/pir/include/core/op_trait.h" +#include + #include "paddle/common/enforce.h" +#include "paddle/pir/include/core/op_trait.h" #include "paddle/pir/include/core/type_utils.h" namespace { diff --git a/paddle/pir/src/core/operation.cc b/paddle/pir/src/core/operation.cc index e7dce069ebd81..923316c765245 100644 --- a/paddle/pir/src/core/operation.cc +++ b/paddle/pir/src/core/operation.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include #include #include diff --git a/paddle/pir/src/core/storage_manager.cc b/paddle/pir/src/core/storage_manager.cc index 6018917062d43..a6fb1621292a6 100644 --- a/paddle/pir/src/core/storage_manager.cc +++ b/paddle/pir/src/core/storage_manager.cc @@ -14,6 +14,7 @@ #include "paddle/pir/include/core/storage_manager.h" +#include #include #include diff --git a/paddle/pir/src/core/value_impl.cc b/paddle/pir/src/core/value_impl.cc index 37dcb48370b6e..5b37e24e8240d 100644 --- a/paddle/pir/src/core/value_impl.cc +++ b/paddle/pir/src/core/value_impl.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include + #include "paddle/pir/src/core/value_impl.h" namespace { diff --git a/paddle/pir/src/dialect/control_flow/ir/cf_op.cc b/paddle/pir/src/dialect/control_flow/ir/cf_op.cc index 3ead6991b272a..8b4cf4727df5b 100644 --- a/paddle/pir/src/dialect/control_flow/ir/cf_op.cc +++ b/paddle/pir/src/dialect/control_flow/ir/cf_op.cc @@ -12,9 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/pir/include/dialect/control_flow/ir/cf_op.h" +#include + #include "paddle/pir/include/core/builtin_type.h" #include "paddle/pir/include/core/ir_printer.h" +#include "paddle/pir/include/dialect/control_flow/ir/cf_op.h" #include "paddle/pir/include/dialect/control_flow/ir/cf_type.h" namespace pir { diff --git a/paddle/pir/src/pass/print_statistics.cc b/paddle/pir/src/pass/print_statistics.cc index 2b92c9e4cc9f6..21d4d67945ce8 100644 --- a/paddle/pir/src/pass/print_statistics.cc +++ b/paddle/pir/src/pass/print_statistics.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include + #include "paddle/common/macros.h" #include "paddle/pir/include/core/operation.h" #include "paddle/pir/include/pass/pass.h" diff --git a/paddle/pir/src/pattern_rewrite/pattern_rewrite_driver.cc b/paddle/pir/src/pattern_rewrite/pattern_rewrite_driver.cc index 474e395c10b6c..7bb086014c8f4 100644 --- a/paddle/pir/src/pattern_rewrite/pattern_rewrite_driver.cc +++ b/paddle/pir/src/pattern_rewrite/pattern_rewrite_driver.cc @@ -14,6 +14,7 @@ #include "paddle/pir/include/pattern_rewrite/pattern_rewrite_driver.h" +#include #include #include #include diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py index 0ea8bb96566ab..35bda07cab67b 100644 --- a/python/paddle/utils/cpp_extension/cpp_extension.py +++ b/python/paddle/utils/cpp_extension/cpp_extension.py @@ -488,7 +488,7 @@ def unix_custom_single_compiler( cflags.append('-DPADDLE_WITH_CUDA') add_std_without_repeat( - cflags, self.compiler.compiler_type, use_std14=True + cflags, self.compiler.compiler_type, use_std17=True ) original_compile(obj, src, ext, cc_args, cflags, pp_opts) finally: diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index 55a9a2e993f31..009176f61fe80 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -418,13 +418,13 @@ def prepare_win_cudaflags(cflags): return cflags -def add_std_without_repeat(cflags, compiler_type, use_std14=False): +def add_std_without_repeat(cflags, compiler_type, use_std17=False): """ - Append -std=c++11/14 in cflags if without specific it before. + Append -std=c++14/17 in cflags if without specific it before. 
""" cpp_flag_prefix = '/std:' if compiler_type == 'msvc' else '-std=' if not any(cpp_flag_prefix in flag for flag in cflags): - suffix = 'c++14' if use_std14 else 'c++11' + suffix = 'c++17' if use_std17 else 'c++14' cpp_flag = cpp_flag_prefix + suffix cflags.append(cpp_flag) diff --git a/python/setup.py.in b/python/setup.py.in index f140b66bd1c44..9fd352ddd26be 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -874,7 +874,13 @@ headers = ( # utils api headers list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/utils', recursive=True)) + # paddle utils headers # init headers - list(find_files('init_phi.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/platform'))) # phi init headers + list(find_files('init_phi.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/platform')) + # phi init headers + # init headers + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pir/include')) + # pir init headers + # init headers + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/pir/drr/include')) + # drr init headers + # init headers + list(find_files('transform_general_functions.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/pir/transforms'))) # pass utils init headers jit_layer_headers = ['layer.h', 'serializer.h', 'serializer_utils.h', 'all.h', 'function.h'] for f in jit_layer_headers: diff --git a/setup.py b/setup.py index 215f767b73d53..2601cfe7b11b3 100644 --- a/setup.py +++ b/setup.py @@ -1370,6 +1370,27 @@ def get_headers(): recursive=True, ) ) + + list( # pir init headers + find_files( + '*.h', + paddle_source_dir + '/paddle/pir/include', + recursive=True, + ) + ) + + list( # drr init headers + find_files( + '*.h', + paddle_source_dir + '/paddle/fluid/pir/drr/include', + recursive=True, + ) + ) + + list( # pass utils init headers + find_files( + 'transform_general_functions.h', + paddle_source_dir + '/paddle/fluid/pir/transforms', + recursive=True, + ) + ) ) jit_layer_headers = [ diff --git a/test/cpp/pir/tools/test_op.h b/test/cpp/pir/tools/test_op.h index 1f61f0ff001ba..31fc4445c36ee 100644 --- a/test/cpp/pir/tools/test_op.h +++ b/test/cpp/pir/tools/test_op.h @@ -14,6 +14,8 @@ #pragma once +#include + #include "paddle/pir/include/core/builder.h" #include "paddle/pir/include/core/builtin_type.h" #include "paddle/pir/include/core/op_base.h" From 4ee55da3426a40e607a1f9615a0f10040c48e4e0 Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Thu, 29 Feb 2024 14:37:37 +0800 Subject: [PATCH 03/55] Revert "cinn (#62177)" (#62221) This reverts commit ee2e49a95365732442df8c7de37436166bad102f. 
--- paddle/scripts/paddle_build.sh | 3 --- tools/coverage/paddle_coverage.sh | 31 ------------------------------- 2 files changed, 34 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 19e9cf3803a84..71ee30a115ef7 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -4235,9 +4235,6 @@ function main() { ;; test) parallel_test - if [ "${WITH_CINN}" == "ON" ] ; then - check_coverage - fi ;; single_test) single_test $2 diff --git a/tools/coverage/paddle_coverage.sh b/tools/coverage/paddle_coverage.sh index 90e02715876ca..ee2a38f5da851 100644 --- a/tools/coverage/paddle_coverage.sh +++ b/tools/coverage/paddle_coverage.sh @@ -39,28 +39,6 @@ lcov --capture -d ./ -o coverage.info --rc lcov_branch_coverage=0 # full html report -function gen_full_html_report_cinn(){ - lcov --extract coverage.info \ - '/paddle/paddle/cinn/adt/*' \ - '/paddle/paddle/cinn/api/*' \ - '/paddle/paddle/cinn/ast_gen_ius/*' \ - '/paddle/paddle/cinn/auto_schedule/*' \ - '/paddle/paddle/cinn/backends/*' \ - '/paddle/paddle/cinn/common/*' \ - '/paddle/paddle/cinn/frontend/*' \ - '/paddle/paddle/cinn/hlir/*' \ - '/paddle/paddle/cinn/ir/*' \ - '/paddle/paddle/cinn/lang/*' \ - '/paddle/paddle/cinn/optim/*' \ - '/paddle/paddle/cinn/poly/*' \ - '/paddle/paddle/cinn/pybind/*' \ - '/paddle/paddle/cinn/runtime/*' \ - '/paddle/paddle/cinn/utils/*' \ - -o coverage-full.tmp \ - --rc lcov_branch_coverage=0 -} - - function gen_full_html_report() { lcov --extract coverage.info \ '/paddle/paddle/fluid/framework/*' \ @@ -142,12 +120,6 @@ else gen_full_html_report || true fi -if [ ${WITH_CINN:-OFF} == "ON" ]; then - gen_full_html_report_cinn || true -else - gen_full_html_report || true -fi - # diff html report function gen_diff_html_report() { @@ -250,8 +222,5 @@ fi if [ "$COVERAGE_LINES_ASSERT" = "1" ] || [ "$PYTHON_COVERAGE_LINES_ASSERT" = "1" ]; then echo "exit 9" > /tmp/paddle_coverage.result - if [ "${WITH_CINN}" == "ON" ]; then - echo "You must one RD(liuhongyu or lanxiang or zhenghuihuang or tianchao zhangliujie)to approval this PR." 
- fi exit 9 fi From f1e3179b95b7de66baf09765c97ceaa7dc590547 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Thu, 29 Feb 2024 14:45:52 +0800 Subject: [PATCH 04/55] [PIR] refine pir add_n and pir onednn support add_n (#62024) * pir onednn support add_n --- .../ir_adaptor/translator/op_translator.cc | 20 +- .../fluid/pir/dialect/op_generator/op_gen.py | 1 - .../pir/dialect/op_generator/ops_api_gen.py | 1 - .../pir/dialect/operator/ir/manual_op.cc | 194 +----------------- .../fluid/pir/dialect/operator/ir/manual_op.h | 24 --- .../fluid/pir/dialect/operator/ir/onednn.yaml | 10 + paddle/fluid/pir/dialect/operator/ir/ops.yaml | 21 +- .../dialect/operator/ir/ops_onednn_extra.yaml | 3 +- .../pir/transforms/pd_op_to_kernel_pass.cc | 2 +- test/mkldnn/test_sum_bf16_mkldnn_op.py | 2 +- test/mkldnn/test_sum_mkldnn_op.py | 6 +- 11 files changed, 34 insertions(+), 250 deletions(-) diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc index 6e1ec454b6bab..1c75d198ef07d 100644 --- a/paddle/fluid/ir_adaptor/translator/op_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc @@ -1355,13 +1355,21 @@ struct ShadowOutputOpTranscriber : public OpTranscriber { struct AddNOpTranscriber : public OpTranscriber { pir::OpInfo LookUpOpInfo(pir::IrContext* ctx, const OpDesc& op_desc) override { - std::string target_op_name = - GetPrefix(ctx, op_desc) + OpNameCompatibleMapping(op_desc.Type()); - if (IsInplace(op_desc)) { - target_op_name += "_"; - } else { - target_op_name += "_with_kernel"; + auto prefix = GetPrefix(ctx, op_desc); + std::string target_op_name; +#ifdef PADDLE_WITH_DNNL + if (prefix == kOneDNNTargetDialectPrefix) { + target_op_name = std::string(kOneDNNTargetDialectPrefix) + "add_n_onednn"; + } else // NOLINT +#endif + { + target_op_name = + GetPrefix(ctx, op_desc) + OpNameCompatibleMapping(op_desc.Type()); + if (IsInplace(op_desc)) { + target_op_name += "_"; + } } + const auto& op_info = ctx->GetRegisteredOpInfo(target_op_name); if (!op_info) { IR_THROW("Op add_n should have corresponding OpInfo %s", target_op_name); diff --git a/paddle/fluid/pir/dialect/op_generator/op_gen.py b/paddle/fluid/pir/dialect/op_generator/op_gen.py index 67462983fbf0a..5513bbb3f5552 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_gen.py @@ -312,7 +312,6 @@ class {TEST_API} {op_name} : public pir::Op<{op_name}{interfaces}{traits}> {{ PD_MANUAL_OP_LIST = { 'add_n', 'add_n_', - 'add_n_with_kernel', 'split_grad', 'expand', 'increment', diff --git a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py index 54b56a2e3c887..534ea49a61f45 100644 --- a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py @@ -118,7 +118,6 @@ NO_NEED_GEN_STATIC_ONLY_APIS = [ 'add_n_', - 'add_n_with_kernel', 'c_allgather', 'c_allreduce_max', 'c_allreduce_min', diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc index 0863737842ba2..ec61f6c7dd88d 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc @@ -13,8 +13,7 @@ // limitations under the License. 
#ifdef GET_OP_LIST #undef GET_OP_LIST -paddle::dialect::AddNOp, paddle::dialect::AddN_Op, - paddle::dialect::AddNWithKernelOp, paddle::dialect::AddNArrayOp, +paddle::dialect::AddNOp, paddle::dialect::AddN_Op, paddle::dialect::AddNArrayOp, paddle::dialect::FusedGemmEpilogueOp, paddle::dialect::AssignOut_Op, paddle::dialect::FusedGemmEpilogueGradOp, paddle::dialect::SplitGradOp, paddle::dialect::ExpandOp, paddle::dialect::CreateArrayOp, @@ -372,196 +371,6 @@ std::vector AddN_Op::InferMeta( return argument_outputs; } -OpInfoTuple AddNWithKernelOp::GetOpInfo() { - std::vector inputs = { - paddle::dialect::OpInputInfo( - "inputs", - "pir::VectorType", - false, - false, - false, - true)}; - std::vector attributes = {}; - std::vector outputs = { - paddle::dialect::OpOutputInfo( - "out", "paddle::dialect::DenseTensorType", false, false)}; - paddle::dialect::OpRunTimeInfo run_time_info = paddle::dialect::OpRunTimeInfo( - "AddNInferMeta", {"inputs"}, "add_n", {"inputs"}, {}, {}, {}, {}); - return std::make_tuple( - inputs, attributes, outputs, run_time_info, "add_n_with_kernel"); -} - -void AddNWithKernelOp::Build(pir::Builder &builder, - pir::OperationArgument &argument, - pir::Value inputs_) { - VLOG(4) << "Start build AddNWithKernelOp"; - - VLOG(4) << "Builder construction inputs"; - std::vector argument_inputs = {inputs_}; - argument.AddInput(inputs_); - - VLOG(4) << "Builder construction attributes"; - pir::AttributeMap argument_attributes = {}; - std::vector argument_outputs = - AddNWithKernelOp::InferMeta(argument_inputs, argument_attributes); - - argument.AddOutputs(argument_outputs.begin(), argument_outputs.end()); -} - -void AddNWithKernelOp::VerifySig() { - VLOG(4) << "Start Verifying inputs, outputs and attributes for: " - "AddNWithKernelOp."; - VLOG(4) << "Verifying inputs:"; - { - auto input_size = num_operands(); - PADDLE_ENFORCE_EQ( - input_size, - 1u, - phi::errors::PreconditionNotMet( - "The size %d of inputs must be equal to 1.", input_size)); - if (auto vec_type = - (*this)->operand_source(0).type().dyn_cast()) { - for (size_t i = 0; i < vec_type.size(); ++i) { - PADDLE_ENFORCE(vec_type[i].isa() || - vec_type[i].isa(), - phi::errors::PreconditionNotMet( - "Type validation failed for the 0th input.")); - } - } else { - PADDLE_ENFORCE((*this)->operand_source(0) - .type() - .isa() || - (*this) - ->operand_source(0) - .type() - .isa(), - phi::errors::PreconditionNotMet( - "Type validation failed for the 0th input.")); - } - } - VLOG(4) << "Verifying attributes:"; - { - // Attributes num is 0, not need to check attributes type. 
- } - VLOG(4) << "Verifying outputs:"; - { - auto output_size = num_results(); - PADDLE_ENFORCE_EQ( - output_size, - 1u, - phi::errors::PreconditionNotMet( - "The size %d of outputs must be equal to 1.", output_size)); - PADDLE_ENFORCE( - (*this)->result(0).type().isa() || - (*this)->result(0).type().isa(), - phi::errors::PreconditionNotMet( - "Type validation failed for the 0th output.")); - } - VLOG(4) << "End Verifying for: AddNWithKernelOp."; -} - -void AddNWithKernelOp::InferMeta(phi::InferMetaContext *infer_meta) { - auto fn = PD_INFER_META(phi::AddNInferMeta); - fn(infer_meta); -} - -std::vector AddNWithKernelOp::InferMeta( - const std::vector &input_values, - const pir::AttributeMap &attributes) { - VLOG(4) << "Start infermeta AddNWithKernelOp"; - IR_ENFORCE(input_values.size() == 1, - "Num of inputs is expected to be 1 but got %d.", - input_values.size()); - pir::Value inputs_ = input_values[0]; - - VLOG(4) << "Builder construction outputs"; - pir::VectorType inputs = inputs_.type().dyn_cast(); - std::vector vec_dense_inputs; - for (size_t i = 0; i < static_cast(inputs.size()); i++) { - if (inputs[i].isa()) { - vec_dense_inputs.push_back(paddle::dialect::IrTensor( - paddle::dialect::TransToPhiDataType( - inputs[i].dyn_cast().dtype()), - inputs[i].dyn_cast().dims(), - inputs[i].dyn_cast().data_layout(), - inputs[i].dyn_cast().lod(), - inputs[i].dyn_cast().offset())); - } else if (inputs[i].isa()) { - vec_dense_inputs.push_back(paddle::dialect::IrTensor( - TransToPhiDataType( - inputs[i] - .dyn_cast() - .dtype()), - inputs[i] - .dyn_cast() - .dims(), - inputs[i] - .dyn_cast() - .data_layout(), - inputs[i].dyn_cast().lod(), - inputs[i] - .dyn_cast() - .offset())); - } else if (inputs[i].isa()) { - vec_dense_inputs.push_back(paddle::dialect::IrTensor( - paddle::dialect::TransToPhiDataType( - inputs[i].dyn_cast().dtype()), - inputs[i].dyn_cast().dims(), - inputs[i].dyn_cast().data_layout(), - inputs[i].dyn_cast().lod(), - inputs[i].dyn_cast().offset())); - } else if (inputs[i].isa()) { - vec_dense_inputs.push_back(paddle::dialect::IrTensor( - TransToPhiDataType( - inputs[i] - .dyn_cast() - .dtype()), - inputs[i] - .dyn_cast() - .dims(), - inputs[i] - .dyn_cast() - .data_layout(), - inputs[i] - .dyn_cast() - .lod(), - inputs[i] - .dyn_cast() - .offset())); - } else { - PADDLE_THROW(phi::errors::Unimplemented( - "Only support DenseTensorType or AllocatedDenseTensorType or " - "SelectedRowsType or AllocatedSelectedRowsType")); - } - } - - std::vector vec_meta_inputs; - for (size_t i = 0; i < vec_dense_inputs.size(); i++) { - vec_meta_inputs.push_back( - paddle::dialect::IrMetaTensor(&vec_dense_inputs[i])); - } - - std::vector meta_inputs; - for (size_t i = 0; i < static_cast(vec_meta_inputs.size()); i++) { - meta_inputs.push_back(&vec_meta_inputs[i]); - } - paddle::dialect::IrTensor dense_out; - paddle::dialect::IrMetaTensor meta_out(&dense_out); - - phi::AddNInferMeta(meta_inputs, &meta_out, phi::MetaConfig(false, false)); - - std::vector argument_outputs; - pir::Type out_dense_tensor_type = paddle::dialect::DenseTensorType::get( - pir::IrContext::Instance(), - paddle::dialect::TransToIrDataType(dense_out.dtype()), - dense_out.dims(), - dense_out.layout(), - dense_out.lod(), - dense_out.offset()); - argument_outputs.push_back(out_dense_tensor_type); - return argument_outputs; -} - OpInfoTuple AddNArrayOp::GetOpInfo() { std::vector inputs = { OpInputInfo("inputs", @@ -4701,7 +4510,6 @@ phi::DataType ArrayPopOp::GetKernelTypeForVar( IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::AddNOp) 
IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::SplitGradOp) IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::AddN_Op) -IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::AddNWithKernelOp) IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::AddNArrayOp) IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::AssignOut_Op) IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::FusedGemmEpilogueOp) diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op.h b/paddle/fluid/pir/dialect/operator/ir/manual_op.h index ea836f68a4959..1f8be853ddcf5 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_op.h +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op.h @@ -90,29 +90,6 @@ class AddN_Op : public pir::Op { - public: - using Op::Op; - static const char *name() { return "pd_op.add_n_with_kernel"; } - static constexpr const char **attributes_name = nullptr; - static constexpr uint32_t attributes_num = 0; - static OpInfoTuple GetOpInfo(); - static void Build(pir::Builder &builder, // NOLINT - pir::OperationArgument &argument, // NOLINT - pir::Value inputs_); - - void VerifySig(); - pir::Value inputs() { return operand_source(0); } - pir::Value out() { return result(0); } - - static void InferMeta(phi::InferMetaContext *infer_meta); - static std::vector InferMeta( - const std::vector &input_values, - const pir::AttributeMap &attributes); -}; - class AddNArrayOp : public pir::Op { @@ -818,7 +795,6 @@ class ArrayPopOp : public pir::OpOpRuntimeInfo().kernel_func; } - if (op_item->isa() || op_item->isa()) { + if (op_item->isa() || op_item->isa()) { if (op_item->result(0).type().isa()) { kernel_fn_str = "add_n_sr"; } diff --git a/test/mkldnn/test_sum_bf16_mkldnn_op.py b/test/mkldnn/test_sum_bf16_mkldnn_op.py index 8fbef74e38d2d..c59fa0d7b8359 100644 --- a/test/mkldnn/test_sum_bf16_mkldnn_op.py +++ b/test/mkldnn/test_sum_bf16_mkldnn_op.py @@ -48,7 +48,7 @@ def setUp(self): self.attrs = {'use_mkldnn': self.use_mkldnn} def test_check_output(self): - self.check_output_with_place(core.CPUPlace()) + self.check_output_with_place(core.CPUPlace(), check_pir_onednn=True) def test_check_grad(self): pass diff --git a/test/mkldnn/test_sum_mkldnn_op.py b/test/mkldnn/test_sum_mkldnn_op.py index 6750f1a79c7ce..fc86c6834b940 100644 --- a/test/mkldnn/test_sum_mkldnn_op.py +++ b/test/mkldnn/test_sum_mkldnn_op.py @@ -39,11 +39,13 @@ def init_data_type(self): def test_check_output(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode - self.check_output(check_dygraph=False) + self.check_output(check_dygraph=False, check_pir_onednn=True) def test_check_grad(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode - self.check_grad(['x0'], 'Out', check_dygraph=False) + self.check_grad( + ['x0'], 'Out', check_dygraph=False, check_pir_onednn=True + ) class TestMKLDNNSumInplaceOp(unittest.TestCase): From ba71b838d694912576e3d3512ff15b737fa4c73c Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Thu, 29 Feb 2024 15:28:45 +0800 Subject: [PATCH 05/55] fix (#62216) --- paddle/fluid/ir_adaptor/translator/program_translator.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/ir_adaptor/translator/program_translator.cc b/paddle/fluid/ir_adaptor/translator/program_translator.cc index 608d24a60b577..e40da8a7b8fb6 100644 --- a/paddle/fluid/ir_adaptor/translator/program_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/program_translator.cc @@ -309,7 +309,7 @@ void ProgramTranslator::TranslateIfOperation( TranslationContext* translation_ctx, pir::Block* dst_block, bool for_bwd) { - 
VLOG(8) << "=============>Start to translate if op:" << op; + LOG_FIRST_N(INFO, 1) << "Translate ConditionalBlockOp"; auto& type_translator = TypeTranslator::instance(); auto cond_op_cond = op->Input("Cond")[0]; @@ -479,7 +479,7 @@ void ProgramTranslator::TranslateWhileOperation( const OpDesc* op, TranslationContext* translation_ctx, pir::Block* dst_block) { - VLOG(8) << "=============>Start to translate while op:" << op; + LOG_FIRST_N(INFO, 1) << "Translate WhileOp"; auto& sub_block = legacy_program_->Block(op->GetBlockAttrId("sub_block")); auto& inputs = op->Output("Out"); auto& cond_var = op->Input("Condition")[0]; From 4865fed1cd3f56dfffd5388bc4152bc64dc7dba3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Thu, 29 Feb 2024 16:50:24 +0800 Subject: [PATCH 06/55] Delete useless test files (#62209) * Update CMakeLists.txt * mv cc file * add TEST_API * delete use_op_itself * Update test_reference_count_pass_last_lived_ops.cc * Update CMakeLists.txt * Delete paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc * Delete paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc --- .../share_varinfo_into_cinn_pass_test.cc | 154 ------------ ...est_reference_count_pass_last_lived_ops.cc | 228 ------------------ 2 files changed, 382 deletions(-) delete mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc delete mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc deleted file mode 100644 index 1f78e293a21a3..0000000000000 --- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc +++ /dev/null @@ -1,154 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/details/computation_op_handle.h" -#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" -#include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" -#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" -#include "paddle/fluid/framework/parallel_executor.h" -#include "paddle/fluid/framework/program_desc.h" - -USE_OP_ITSELF(mul); -USE_OP_ITSELF(elementwise_add); - -USE_OP_ITSELF(cinn_launch); -PD_DECLARE_KERNEL(cinn_launch, CPU, ALL_LAYOUT); -#ifdef PADDLE_WITH_CUDA -PD_DECLARE_KERNEL(cinn_launch, GPU, ALL_LAYOUT); -#endif - -namespace paddle::framework { - -using Name2VarInfoMap = - std::unordered_map>; - -static ProgramDesc BuildProgramInsideCinnLaunchOp() { - ProgramDesc program; - auto* block = program.MutableBlock(0); - block->Var("var1"); - block->Var("var2"); - block->Var("var3"); - block->Var("var4"); - block->Var("var5"); - - auto add_op = - std::unique_ptr(new OpDesc("elementwise_add", - {{"X", {"var1"}}, {"Y", {"var2"}}}, - {{"Out", {"var3"}}}, - {})); - block->AppendAllocatedOp(std::move(add_op)); - auto mul_op = std::unique_ptr(new OpDesc( - "mul", {{"X", {"var3"}}, {"Y", {"var4"}}}, {{"Out", {"var5"}}}, {})); - block->AppendAllocatedOp(std::move(mul_op)); - return program; -} - -static ProgramDesc BuildProgramWithCinnLaunchOp(int64_t compilation_key) { - // create a cinn_launch op - ProgramDesc program; - auto* block = program.MutableBlock(0); - block->Var("var1"); - block->Var("var2"); - block->Var("var4"); - block->Var("var5"); - - auto cinn_launch_op = std::unique_ptr( - new OpDesc("cinn_launch", - {{"X", {"var1", "var2", "var4"}}}, - {{"Out", {"var5"}}}, - {{"compilation_key", compilation_key}})); - block->AppendAllocatedOp(std::move(cinn_launch_op)); - return program; -} - -struct TestPassContext { - explicit TestPassContext(const ProgramDesc& program) { - graph = std::make_unique(program); - details::BuildStrategy build_strategy; - details::ExecutionStrategy exec_strategy; - exec_strategy.use_device_ = paddle::platform::kCUDA; - executor.reset(new ParallelExecutor(platform::CUDAPlace(0), - &scope, - exec_strategy, - build_strategy, - graph.get())); - } - - Scope scope; - std::unique_ptr graph; - std::unique_ptr executor; -}; - -TEST(ShareMemInfoToSubGraphPassTest, test_main_graph_share_varinfo) { - // add a subgraph to CinnCompiler - auto subgraph = std::make_unique(BuildProgramInsideCinnLaunchOp()); - subgraph->GetOrInit( - paddle2cinn::kMemOptVarInfoFromMainGraph); - auto compilation_key = - paddle2cinn::CinnCompiler::GetInstance()->AddGraph(std::move(subgraph)); - - // build test data and apply pass - auto context = std::make_unique( - BuildProgramWithCinnLaunchOp(compilation_key)); - - // check result - const ir::Graph& result_subgraph = - paddle2cinn::CinnCompiler::GetInstance()->FindGraph(compilation_key); - const auto& dst_varinfo_map = result_subgraph.Get( - paddle2cinn::kMemOptVarInfoFromMainGraph); - ASSERT_EQ(dst_varinfo_map.size(), 4); - EXPECT_EQ(dst_varinfo_map.count("var1"), 1); - EXPECT_EQ(dst_varinfo_map.count("var5"), 1); - EXPECT_EQ(dst_varinfo_map.at("var1").use_count(), 2); - EXPECT_EQ(dst_varinfo_map.at("var5").use_count(), 2); -} - -TEST(ShareMemInfoToSubGraphPassTest, test_subgraph_take_varinfo) { - // build test data and apply pass - auto context = - 
-  auto& varinfo_map_shared = context->graph->GetOrInit<Name2VarInfoMap>(
-      paddle2cinn::kMemOptVarInfoFromMainGraph);
-  varinfo_map_shared = {
-      {"var1", std::make_shared<ir::MemOptVarInfo>("var1", 1)},
-      {"var2", std::make_shared<ir::MemOptVarInfo>("var2", 2)},
-  };
-
-  ir::MemOptVarInfoMapList varinfo_maps(1);
-  auto& dst_varinfo_map = varinfo_maps.front();
-  dst_varinfo_map = {{"var1", std::make_shared<ir::MemOptVarInfo>("var1", 1)},
-                     {"var2", std::make_shared<ir::MemOptVarInfo>("var2", 1)},
-                     {"var3", std::make_shared<ir::MemOptVarInfo>("var3", 1)},
-                     {"var4", std::make_shared<ir::MemOptVarInfo>("var4", 1)},
-                     {"var5", std::make_shared<ir::MemOptVarInfo>("var5", 1)}};
-  auto share_pass =
-      ir::PassRegistry::Instance().Get("share_varinfo_into_cinn_pass");
-  share_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &varinfo_maps);
-  share_pass->Apply(context->graph.get());
-
-  // check result
-  ASSERT_NE(dst_varinfo_map.at("var1")->ParentHolder(), nullptr);
-  ASSERT_NE(dst_varinfo_map.at("var2")->ParentHolder(), nullptr);
-  ASSERT_EQ(dst_varinfo_map.at("var3")->ParentHolder(), nullptr);
-  ASSERT_EQ(dst_varinfo_map.at("var4")->ParentHolder(), nullptr);
-  ASSERT_EQ(dst_varinfo_map.at("var5")->ParentHolder(), nullptr);
-}
-
-}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
deleted file mode 100644
index eeec6fd8788d4..0000000000000
--- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
+++ /dev/null
@@ -1,228 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "gtest/gtest.h"
-#include "paddle/common/flags.h"
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
-#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
-#include "paddle/fluid/framework/parallel_executor.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/phi/core/kernel_registry.h"
-
-COMMON_DECLARE_double(eager_delete_tensor_gb);
-
-namespace paddle {
-namespace framework {
-namespace p = paddle::platform;
-
-static std::vector<platform::Place> CreatePlaces(size_t num, bool use_cuda) {
-  std::vector<platform::Place> result;
-  result.reserve(num);
-  for (size_t i = 0; i < num; ++i) {
-    if (use_cuda) {
-      result.emplace_back(platform::CUDAPlace(static_cast<int>(i)));
-    } else {
-      result.emplace_back(platform::CPUPlace());
-    }
-  }
-  return result;
-}
-
-static void NewVar(BlockDesc *block,
-                   const std::string &name,
-                   const std::vector<int64_t> &shape) {
-  auto *var_desc = block->Var(name);
-  var_desc->SetShape(shape);
-}
-
-static void AppendOp(BlockDesc *block,
-                     const std::string &type,
-                     VariableNameMap inputs,
-                     VariableNameMap outputs,
-                     AttributeMap attrs) {
-  auto &op_info = OpInfoMap::Instance().Get(type);
-  if (op_info.Checker()) {
-    op_info.Checker()->Check(&attrs);
-  }
-
-  auto *op = block->AppendOp();
-  op->SetType(type);
-  for (auto &pair : inputs) {
-    op->SetInput(pair.first, pair.second);
-  }
-
-  for (auto &pair : outputs) {
-    op->SetOutput(pair.first, pair.second);
-    for (auto &var_name : pair.second) {
-      if (!block->FindVarRecursive(var_name)) {
-        NewVar(block, var_name, {});
-      }
-    }
-  }
-
-  op->SetAttrMap(attrs);
-  op->InferVarType(block);
-  op->InferShape(*block);
-}
-
-class ReferenceCountPassTestHelper {
- public:
-  ReferenceCountPassTestHelper(const ProgramDesc &program, bool use_cuda)
-      : graph_(program) {
-    details::BuildStrategy build_strategy;
-    build_strategy.enable_inplace_ = false;
-    build_strategy.memory_optimize_ = false;
-    FLAGS_eager_delete_tensor_gb = -1;
-
-    details::ExecutionStrategy exec_strategy;
-    exec_strategy.use_device_ = use_cuda ? p::kCUDA : p::kCPU;
-
-    executor_ = std::make_unique<ParallelExecutor>(CreatePlaces(1, use_cuda),
-                                                   std::vector<std::string>(),
-                                                   "",
-                                                   &scope_,
-                                                   std::vector<Scope *>(),
-                                                   exec_strategy,
-                                                   build_strategy,
-                                                   &graph_);
-
-    auto ref_cnt_pass =
-        ir::PassRegistry::Instance().Get("reference_count_pass");
-    ref_cnt_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_);
-    ref_cnt_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars_);
-    ref_cnt_pass->Apply(&const_cast<ir::Graph &>(executor_->Graph()));
-  }
-
-  bool IsLastLivedOps(const std::string &name,
-                      std::vector<std::string> ops) const {
-    std::sort(ops.begin(), ops.end());
-    return LastLivedOpTypes(name) == ops;
-  }
-
-  std::vector<OperatorBase *> LastLivedOps(const std::string &name) const {
-    auto &ops = last_live_ops_of_vars_[0].at(name).ops();
-    std::vector<OperatorBase *> ret;
-    ret.reserve(ops.size());
-    for (auto *op : ops) {
-      ret.emplace_back(op->GetOp());
-    }
-    return ret;
-  }
-
- private:
-  std::vector<std::string> LastLivedOpTypes(const std::string &name) const {
-    auto iter = last_live_ops_of_vars_[0].find(name);
-    std::vector<std::string> ret;
-    if (iter != last_live_ops_of_vars_[0].end()) {
-      for (auto *op : iter->second.ops()) {
-        ret.emplace_back(op->GetOp()->Type());
-      }
-    }
-    std::sort(ret.begin(), ret.end());
-    return ret;
-  }
-
- private:
-  ir::Graph graph_;
-  Scope scope_;
-  std::unique_ptr<ParallelExecutor> executor_;
-
-  ir::MemOptVarInfoMapList mem_opt_var_infos_;
-  std::vector<ir::LastLiveOpsOfVars> last_live_ops_of_vars_;
-};
-
-TEST(test_reference_count_pass, test_no_need_buffer_var_shrink) {
-  ProgramDesc program;
-  auto *block = program.MutableBlock(0);
-  std::vector<int64_t> shape{{3, 4, 5}};
-
-  /**
-   * The network is:
-   *
-   * x0 = fluid.layer.data(...)
-   * x1 = scale(x0, scale=1)
-   * x2 = scale(x1, scale=2)
-   * x3 = elementwise_mul(x1, x2)
-   * scale(x3, out=x1, scale=3) # produce a new version of x1
-   * x4, x5 = elementwise_add_grad(dout=x3, x=x2, y=x1)
-   * x6 = elementwise_mul(x4, x5)
-   * x7 = elementwise_add(x5, x5)
-   */
-  std::string x0 = "x0";
-  std::string x1 = "x1";
-  std::string x2 = "x2";
-  std::string x3 = "x3";
-  std::string x4 = "x4";
-  std::string x5 = "x5";
-  std::string x6 = "x6";
-  std::string x7 = "x7";
-
-  NewVar(block, x0, shape);
-  AppendOp(block, "scale", {{"X", {x0}}}, {{"Out", {x1}}}, {{"scale", 1.0f}});
-  AppendOp(block, "scale", {{"X", {x1}}}, {{"Out", {x2}}}, {{"scale", 2.0f}});
-  AppendOp(block,
-           "elementwise_mul",
-           {{"X", {x1}}, {"Y", {x2}}},
-           {{"Out", {x3}}},
-           {});
-  AppendOp(block, "scale", {{"X", {x3}}}, {{"Out", {x1}}}, {{"scale", 3.0f}});
-  AppendOp(block,
-           "elementwise_add_grad",
-           {{GradVarName("Out"), {x3}}, {"X", {x2}}, {"Y", {x1}}},
-           {{GradVarName("X"), {x4}}, {GradVarName("Y"), {x5}}},
-           {});
-  AppendOp(block,
-           "elementwise_mul",
-           {{"X", {x4}}, {"Y", {x5}}},
-           {{"Out", {x6}}},
-           {});
-  AppendOp(block,
-           "elementwise_add",
-           {{"X", {x5}}, {"Y", {x5}}},
-           {{"Out", {x7}}},
-           {});
-
-  std::vector<bool> use_cuda_list{false};
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  use_cuda_list.push_back(true);
-#endif
-  for (auto use_cuda : use_cuda_list) {
-    ReferenceCountPassTestHelper helper(program, use_cuda);
-    ASSERT_TRUE(helper.IsLastLivedOps(x0, {"scale"}));
-    ASSERT_EQ(PADDLE_GET_CONST(float,
-                               helper.LastLivedOps(x0)[0]->Attrs().at("scale")),
-              1.0f);
-
-    ASSERT_TRUE(helper.IsLastLivedOps(x1, {"scale"}));
-    ASSERT_EQ(PADDLE_GET_CONST(float,
-                               helper.LastLivedOps(x1)[0]->Attrs().at("scale")),
-              3.0f);
-
-    ASSERT_TRUE(helper.IsLastLivedOps(x2, {"elementwise_mul"}));
-    ASSERT_TRUE(helper.IsLastLivedOps(x3, {"elementwise_add_grad"}));
-
-    ASSERT_TRUE(helper.IsLastLivedOps(x4, {"elementwise_mul"}));
-    ASSERT_TRUE(
-        helper.IsLastLivedOps(x5, {"elementwise_mul", "elementwise_add"}));
-
-    ASSERT_TRUE(helper.IsLastLivedOps(x6, {"elementwise_mul"}));
-    ASSERT_TRUE(helper.IsLastLivedOps(x7, {"elementwise_add"}));
-  }
-}
-
-}  // namespace framework
-}  // namespace paddle
From 4448d45cafa17d085368550f836a1e0396d2b4d0 Mon Sep 17 00:00:00 2001
From: hong <43953930+phlrain@users.noreply.github.com>
Date: Thu, 29 Feb 2024 16:55:24 +0800
Subject: [PATCH 07/55] [CINN]update dyshape workflow (#62101)

* update dyshape workflow

* update

* polish code

* polish code

* fix compiler bug
---
 .../operator/transforms/add_cinn_pass.cc      |  2 +-
 .../transforms/dynamic_reshape_pass.cc        |  2 +-
 .../transforms/replace_dynamic_expand_pass.cc | 25 +++++++++++++++++--
 3 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc
index 496370ee7bfcd..24c05b6b006c3 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc
@@ -107,9 +107,9 @@ void ApplyCinnPreprocessPass(
   pass_manager->AddPass(
       cinn::dialect::ir::CreateMoveGenerateShapeOpsToProloguePass());
-  pass_manager->AddPass(cinn::dialect::ir::CreateDivideGroupOpToFusionOpPass());
   pass_manager->AddPass(cinn::dialect::ir::CreateDynamicReshapeOpPass());
   pass_manager->AddPass(cinn::dialect::ir::CreateReplaceDynamicExpandOpPass());
+  pass_manager->AddPass(cinn::dialect::ir::CreateDivideGroupOpToFusionOpPass());
   pass_manager->AddPass(pir::CreateDeadCodeEliminationPass());
 
   pass_manager->Run(program);
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.cc
index cab96a8bd27f9..60c9edca4fb3c 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.cc
@@ -118,7 +118,7 @@ class DynamicReshapeOpPass : public pir::Pass {
     for (uint32_t i = 0; i < op->num_regions(); ++i) {
       for (auto& block : op->region(i)) {
         for (auto& op : block) {
-          if (op.isa<cinn::dialect::FusionOp>()) {
+          if (op.isa<cinn::dialect::GroupOp>()) {
             auto [_, num_rewrites] =
                 pir::ApplyPatternsGreedily(&op, patterns_, cfg);
             AddStatistics(num_rewrites);
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/replace_dynamic_expand_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/replace_dynamic_expand_pass.cc
index b37ab970da882..85bdf3985c8a5 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/replace_dynamic_expand_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/replace_dynamic_expand_pass.cc
@@ -52,7 +52,28 @@ class DynamicExpandOpPattern
     for (size_t i = 0; i < x_rank; ++i) {
       broadcast_axes[i] = i + index_gap;
     }
-    std::vector<int64_t> out_shape(out_rank, -1);
+
+    pir::ShapeConstraintIRAnalysis& shape_analysis =
+        pir::ShapeAnalysisManager::Instance().Get(op->GetParentProgram());
+
+    const auto& UpdateOutputShapeByDimExpr = [&]() -> std::vector<int64_t> {
+      std::vector<int64_t> out_shape(out_rank, -1);
+      if (shape_analysis.HasShapeOrDataForValue(op->result(0))) {
+        VLOG(3) << "found shape dialect";
+        auto shape_info =
+            shape_analysis.GetShapeOrDataForValue(op->result(0)).shape();
+
+        for (size_t i = 0; i < shape_info.size(); ++i) {
+          if (shape_info[i].isa<int64_t>()) {
+            out_shape[i] = shape_info[i].Get<int64_t>();
+          }
+        }
+      }
+      return out_shape;
+    };
+
+    auto out_shape = UpdateOutputShapeByDimExpr();
+
     return rewriter.Build<cinn::dialect::BroadcastOp>(
         op->operand_source(0), broadcast_axes, out_shape);
   }();
@@ -91,7 +112,7 @@ class ReplaceDynamicExpandOpPass : public pir::Pass {
     for (uint32_t i = 0; i < op->num_regions(); ++i) {
       for (auto& block : op->region(i)) {
         for (auto& op : block) {
-          if (op.isa<cinn::dialect::FusionOp>()) {
+          if (op.isa<cinn::dialect::GroupOp>()) {
             const auto& [_, num_rewrites] =
                 pir::ApplyPatternsGreedily(&op, patterns_, cfg);
             AddStatistics(num_rewrites);
From 473f7ba0a218df3691f261005447a9139b649e70 Mon Sep 17 00:00:00 2001
From: diadestiny <44188454+diadestiny@users.noreply.github.com>
Date: Thu, 29 Feb 2024 17:18:09 +0800
Subject: [PATCH 08/55] [SOT][3.12] fix out-of-range codegen when generating
 `LOAD_ATTR` in Python 3.12 (#62176)

---
 .../jit/sot/opcode_translator/executor/pycode_generator.py | 6 +++++-
 test/sot/skip_files_py312                                  | 1 -
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/paddle/jit/sot/opcode_translator/executor/pycode_generator.py b/python/paddle/jit/sot/opcode_translator/executor/pycode_generator.py
index 2ada3f7228f11..ce25cabd6f2d4 100644
--- a/python/paddle/jit/sot/opcode_translator/executor/pycode_generator.py
+++ b/python/paddle/jit/sot/opcode_translator/executor/pycode_generator.py
@@ -742,12 +742,14 @@ def gen_load_deref(self, name):
         idx = self.cell_free_storage.index(name)
         return self.add_instr("LOAD_DEREF", arg=idx, argval=name)
 
-    def gen_load_attr(self, name: str):
+    def gen_load_attr(self, name: str, is_method=False):
         if name not in self._code_options["co_names"]:
             self._code_options["co_names"].append(name)
         idx = self._code_options["co_names"].index(name)
         if sys.version_info >= (3, 12):
             idx <<= 1
+            if is_method:
+                idx |= 1
         return self.add_instr("LOAD_ATTR", arg=idx, argval=name)
 
     def gen_store_attr(self, name: str):
@@ -763,6 +765,8 @@ def gen_delete_attr(self, name: str):
         return self.add_instr("DELETE_ATTR", arg=idx, argval=name)
 
     def gen_load_method(self, name: str):
+        if sys.version_info >= (3, 12):
+            return self.gen_load_attr(name, True)
         if name not in self._code_options["co_names"]:
             self._code_options["co_names"].append(name)
         idx = self._code_options["co_names"].index(name)
diff --git a/test/sot/skip_files_py312 b/test/sot/skip_files_py312
index 796fdb62e5001..4d3ee9050ad6c 100644
--- a/test/sot/skip_files_py312
+++ b/test/sot/skip_files_py312
@@ -1,6 +1,5 @@
 ./test_11_jumps.py
 ./test_12_for_loop.py
-./test_21_global.py
 ./test_builtin_zip.py
 ./test_inplace_api.py
 ./test_min_graph_size.py
From 18ea0edb5b1f1a5048efdfe9047e218f02bf5b53 Mon Sep 17 00:00:00 2001
From: wanghuancoder
Date: Thu, 29 Feb 2024 18:56:45 +0800
Subject: [PATCH 09/55] pir onednn support slice,stack (#62220)

---
 .../fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml | 8 +++++---
 test/mkldnn/test_slice_mkldnn_op.py                     | 7 ++++---
 test/mkldnn/test_stack_mkldnn_op.py                     | 2 +-
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml b/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml
index e85e39621ee9d..b2e5cc7000f87 100644
--- a/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml
+++ b/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml
@@ -248,9 +248,11 @@
 
 - op : sigmoid_grad
 
-# - op : slice
+- op : slice
+  extra_args : str mkldnn_data_type="float32"
 
-# - op : slice_grad
+- op : slice_grad
+  extra_args : str mkldnn_data_type="float32"
 
 - op : softmax
   extra_args : str data_format="AnyLayout", str mkldnn_data_type="float32", bool is_test=false
@@ -276,7 +278,7 @@
 - op : squeeze_grad
   extra_args : str mkldnn_data_type="float32"
 
-# - op : stack
+- op : stack
 
 - op : subtract
 
diff --git a/test/mkldnn/test_slice_mkldnn_op.py b/test/mkldnn/test_slice_mkldnn_op.py
index 66161dbad4908..1a71278a9f216 100644
--- a/test/mkldnn/test_slice_mkldnn_op.py
+++ b/test/mkldnn/test_slice_mkldnn_op.py
@@ -55,10 +55,10 @@ def config(self):
         self.out = self.input[1:3, 0:3, 2:4, :]
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_pir_onednn=True)
 
     def test_check_grad(self):
-        self.check_grad(['Input'], 'Out')
+        self.check_grad(['Input'], 'Out', check_pir_onednn=True)
 
 
 class TestSliceOneDNNOp1(TestSliceOneDNNOp):
@@ -217,7 +217,7 @@ def calculate_grads(self):
             ] = self.dout
 
         def test_check_output(self):
-            self.check_output_with_place(core.CPUPlace())
+            self.check_output_with_place(core.CPUPlace(), check_pir_onednn=True)
 
         def test_check_grad(self):
             self.calculate_grads()
@@ -227,6 +227,7 @@ def test_check_grad(self):
                 "Out",
                 user_defined_grads=[self.dx],
                 user_defined_grad_outputs=[convert_float_to_uint16(self.dout)],
+                check_pir_onednn=True,
             )
 
     cls_name = "{}_{}".format(parent.__name__, "BF16")
diff --git a/test/mkldnn/test_stack_mkldnn_op.py b/test/mkldnn/test_stack_mkldnn_op.py
index 82acf285ce16d..8b91c246d6e6b 100644
--- a/test/mkldnn/test_stack_mkldnn_op.py
+++ b/test/mkldnn/test_stack_mkldnn_op.py
@@ -59,7 +59,7 @@ def setUp(self):
         self.attrs = {'axis': self.axis, 'use_mkldnn': True}
 
     def test_check_output(self):
-        self.check_output_with_place(core.CPUPlace())
+        self.check_output_with_place(core.CPUPlace(), check_pir_onednn=True)
 
     # JUST FOR CI TO PASS, GRAD IS NOT IMPLEMENTED YET
     def test_check_grad(self):
From e0027d222284c148b50a7bde5f915676acdc7585 Mon Sep 17 00:00:00 2001
From: wanghuancoder
Date: Thu, 29 Feb 2024 19:05:52 +0800
Subject: [PATCH 10/55] [PIR] pir onednn support some fused ops (#62187)

* onednn support some fused ops
---
 .../pir_adaptor/pir_adaptor_util.cc           |   8 +-
 .../fused/mkldnn/fusion_lstm_mkldnn_op.cc     |  16 +-
 .../fluid/pir/dialect/operator/ir/onednn.yaml |  38 +++++
 .../dialect/operator/ir/ops_onednn_extra.yaml |  11 +-
 .../fluid/pir/dialect/operator/utils/utils.cc |   1 +
 paddle/phi/api/yaml/op_compat.yaml            |  38 +++++
 paddle/phi/infermeta/fusion.cc                | 160 ++++++++++++++++++
 paddle/phi/infermeta/fusion.h                 |  27 +++
 test/legacy_test/op_test.py                   |   8 +-
 test/legacy_test/test_fusion_lstm_op.py       |   4 +-
 .../mkldnn/test_fusion_lstm_bf16_mkldnn_op.py |   5 +-
 .../mkldnn/test_fusion_lstm_int8_mkldnn_op.py |   1 +
 test/mkldnn/test_fusion_lstm_mkldnn_op.py     |   7 +-
 test/white_list/op_accuracy_white_list.py     |   1 +
 14 files changed, 305 insertions(+), 20 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc
index 1e2fa3269bb41..11b263f540500 100644
--- a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc
+++ b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc
@@ -951,27 +951,27 @@ std::shared_ptr<OperatorBase> BuildOperatorBase(
         }
         attr_map[legacy_arg_name] = vec_int;
       } else if (array_list[0].isa<pir::Int64Attribute>()) {
-        std::vector<int> vec_int64;
+        std::vector<int64_t> vec_int64;
         for (auto attribute : array_list) {
           vec_int64.push_back(
              attribute.dyn_cast<pir::Int64Attribute>().data());  // NOLINT
        }
        attr_map[legacy_arg_name] = vec_int64;
      } else if (array_list[0].isa<pir::BoolAttribute>()) {
-        std::vector<int> vec_bool;
+        std::vector<bool> vec_bool;
        for (auto attribute : array_list) {
          vec_bool.push_back(attribute.dyn_cast<pir::BoolAttribute>().data());
        }
        attr_map[legacy_arg_name] = vec_bool;
      } else if (array_list[0].isa<pir::FloatAttribute>()) {
-        std::vector<int> vec_float;
+        std::vector<float> vec_float;
        for (auto attribute : array_list) {
          vec_float.push_back(
              attribute.dyn_cast<pir::FloatAttribute>().data());  // NOLINT
        }
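        // Note (assumed rationale for the type changes above): each vec_*
        // must be declared with the element type that matches the pir
        // attribute kind being unpacked, because attr_map stores a typed
        // Attribute variant and a mismatched vector type corrupts the value
        // when the legacy operator reads it back.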
        attr_map[legacy_arg_name] = vec_float;
      } else if (array_list[0].isa<pir::DoubleAttribute>()) {
-        std::vector<int> vec_double;
+        std::vector<double> vec_double;
        for (auto attribute : array_list) {
          vec_double.push_back(
              attribute.dyn_cast<pir::DoubleAttribute>().data());  // NOLINT
diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
index ada14e280a0f3..e004b35d0c3ec 100644
--- a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
+++ b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
@@ -321,7 +321,7 @@ class LSTMMKLDNNHandler
   }
 };
 
-template <typename T>
+template <typename T, typename DeviceContext>
 class FusionLSTMMKLDNNKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -473,9 +473,11 @@ class FusionLSTMMKLDNNKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_KERNEL(fusion_lstm,
-                   MKLDNN,
-                   phi::CPUPlace,
-                   ops::FusionLSTMMKLDNNKernel<float>,
-                   ops::FusionLSTMMKLDNNKernel<paddle::platform::bfloat16>,
-                   ops::FusionLSTMMKLDNNKernel<uint8_t>);
+
+PD_REGISTER_STRUCT_KERNEL(fusion_lstm,
+                          OneDNN,
+                          ONEDNN,
+                          ops::FusionLSTMMKLDNNKernel,
+                          float,
+                          uint8_t,
+                          paddle::platform::bfloat16) {}
diff --git a/paddle/fluid/pir/dialect/operator/ir/onednn.yaml b/paddle/fluid/pir/dialect/operator/ir/onednn.yaml
index a786f395db1af..18a799dfb28a9 100644
--- a/paddle/fluid/pir/dialect/operator/ir/onednn.yaml
+++ b/paddle/fluid/pir/dialect/operator/ir/onednn.yaml
@@ -74,6 +74,44 @@
   kernel :
     func : fused_elementwise_sub
 
+- op : fused_matmul
+  args : (Tensor x, Tensor y, Tensor residual_data, bool trans_x=false, bool trans_y=false, float matmul_alpha=1.0, str fuse_activation="", float fuse_alpha=0.0, float fuse_beta=0.0, float fused_output_scale=1.0, int[] fused_reshape_x={}, int[] fused_transpose_x={}, int[] fused_reshape_y={}, int[] fused_transpose_y={}, int[] fused_reshape_out={}, int[] fused_transpose_out={}, str mkldnn_data_type="float32", float scale_x=1.0, float scale_y=1.0, float scale_in_eltwise=0.0, float scale_out=1.0, bool force_fp32_output=false)
+  output : Tensor(out)
+  infer_meta :
+    func : FusedMatmulInferMeta
+  kernel :
+    func : fused_matmul
+  optional : residual_data
+
+- op : fused_softplus
+  args : (Tensor x, float beta=1.0, float threshold=20.0, str fuse_activation="", float fuse_alpha=0.0, float fuse_beta=0.0)
+  output : Tensor(out)
+  infer_meta :
+    func : UnchangedExceptDtypeInferMeta
+    param : [x]
+  kernel :
+    func : fused_softplus
+
+- op : fused_transpose
+  args : (Tensor x, int[] axis={}, int[] fused_squeeze2_axes={}, int[] fused_unsqueeze2_axes={}, int[] fused_reshape2_shape={}, float scale=1.0, float shift=0.0, str output_data_type="")
+  output : Tensor(out)
+  infer_meta :
+    func : TransposeInferMeta
+    param : [x, axis]
+  kernel :
+    func : fused_transpose
+
+- op : fusion_lstm
+  args : (Tensor x, Tensor weight_x, Tensor weight_h, Tensor bias, Tensor h0, Tensor c0, bool use_peepholes=true, bool is_reverse=false, bool use_seq=true, str gate_activation="sigmoid", str cell_activation="tanh", str candidate_activation="tanh", float scale_data=1.0, float shift_data=0.0, float[] scale_weights={1.0}, bool force_fp32_output=false)
+  output : Tensor(hidden), Tensor(cell), Tensor(xx), Tensor(batched_input), Tensor(batched_hidden), Tensor(batched_cell), Tensor(reordered_h0), Tensor(reordered_c0), Tensor(checked_cell)
+  infer_meta :
+    func : FusionLstmInferMeta
+  kernel :
+    func : fusion_lstm
+    data_type : x
+  optional : h0, c0
+  intermediate : xx, batched_input, batched_hidden, batched_cell, reordered_h0, reordered_c0, checked_cell
+
 - op: multi_gru
   args: (Tensor x, Tensor[] weight_x, Tensor[] weight_h, Tensor[] bias, Tensor[] scale_weights, str activation="tanh", str gate_activation="sigmoid", int layers=1, bool origin_mode=false, str mkldnn_data_type="float32", float scale_data=1.0, float shift_data=1.0, bool force_fp32_output=false)
   output: Tensor(hidden)
diff --git a/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml b/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml
index b2e5cc7000f87..fd8c3a409a573 100644
--- a/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml
+++ b/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml
@@ -111,16 +111,19 @@
 
 - op : fused_elementwise_sub
 
-# - op : fused_matmul
+- op : fused_matmul
 
-# - op : fused_softplus
+- op : fused_softplus
 
-# - op : fused_transpose
+- op : fused_transpose
+  extra_args : str data_format="AnyLayout", str mkldnn_data_type="float32"
+  data_format_tensors : x
 
 - op : fusion_gru
   extra_args : str mkldnn_data_type="float32", float scale_data=1.0, float shift_data=0.0, float[] scale_weights={1.0f}
 
-# - op : fusion_lstm
+- op : fusion_lstm
+  extra_args : str mkldnn_data_type="float32"
 
 - op : gaussian
 
diff --git a/paddle/fluid/pir/dialect/operator/utils/utils.cc b/paddle/fluid/pir/dialect/operator/utils/utils.cc
index 9b450977814b6..931c7d4b33624 100644
--- a/paddle/fluid/pir/dialect/operator/utils/utils.cc
+++ b/paddle/fluid/pir/dialect/operator/utils/utils.cc
@@ -84,6 +84,7 @@ const std::unordered_set<std::string> LegacyOpList = {
     paddle::onednn::dialect::QuantizeOp::name(),
     paddle::onednn::dialect::RequantizeOp::name(),
     paddle::onednn::dialect::MultiGruOp::name(),
+    paddle::onednn::dialect::FusionLstmOp::name(),
 #endif
     CReduceMinOp::name(),
     PushSparseV2Op::name(),
diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml
index 74263a1dd522d..840ce5ef29de3 100755
--- a/paddle/phi/api/yaml/op_compat.yaml
+++ b/paddle/phi/api/yaml/op_compat.yaml
@@ -1445,6 +1445,10 @@
     {x_grad : DX, y_grad : DY, bias_grad : DBias}
 
 - op : fused_transpose
+  inputs:
+    {x : X}
+  outputs :
+    {out : Out}
   extra :
     attrs : [str data_format = "AnyLayout"]
 
@@ -1467,6 +1471,26 @@
     attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", float Scale_data = 1.0f, float Shift_data = 0.0f, 'float[] Scale_weights = {1.0f}']
 
 - op : fusion_lstm
+  inputs :
+    x : X
+    h0 : H0
+    weight_x : WeightX
+    weight_h : WeightH
+    bias : Bias
+    c0 : C0
+  outputs :
+    out : Out
+    hidden : Hidden
+    cell : Cell
+    xx : XX
+    batched_input : BatchedInput
+    batched_hidden : BatchedHidden
+    batched_cell : BatchedCell
+    reordered_h0 : ReorderedH0
+    reordered_c0 : ReorderedC0
+    checked_cell : CheckedCell
+  attrs :
+    {scale_data : Scale_data, shift_data : Shift_data, scale_weights : Scale_weights}
   extra :
     attrs : [bool use_mkldnn = true, str mkldnn_data_type = "float32"]
 
@@ -3610,6 +3634,20 @@
   outputs :
     {out : Out, intermediate_out : IntermediateOut}
 
+- op: fused_matmul
+  inputs :
+    {x: X, y: Y, residual_data: ResidualData}
+  outputs :
+    {out : Out}
+  attrs :
+    {scale_x : Scale_x, scale_y : Scale_y, scale_out : Scale_out, scale_in_eltwise : Scale_in_eltwise, fused_reshape_x : fused_reshape_X, fused_transpose_x : fused_transpose_X, fused_reshape_y : fused_reshape_Y, fused_transpose_y : fused_transpose_Y, fused_reshape_out : fused_reshape_Out, fused_transpose_out : fused_transpose_Out}
+
+- op: fused_softplus
+  inputs :
+    {x: X}
+  outputs :
+    {out : Out}
+
 - op: fusion_squared_mat_sub
   inputs :
     x : X
diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc
index af280b44d6501..4af21b36b34da 100644
--- a/paddle/phi/infermeta/fusion.cc
+++ b/paddle/phi/infermeta/fusion.cc
@@ -3832,6 +3832,166 @@ void MultiGruInferMeta(
   hidden->share_lod(x);
 }
 
+void FusionLstmInferMeta(const MetaTensor& x,
+                         const MetaTensor& weight_x,
+                         const MetaTensor& weight_h,
+                         const MetaTensor& bias,
+                         const MetaTensor& h0,
+                         const MetaTensor& c0,
+                         const bool use_peepholes,
+                         const bool is_reverse,
+                         const bool use_seq,
+                         const std::string& gate_activation,
+                         const std::string& cell_activation,
+                         const std::string& candidate_activation,
+                         const float scale_data,
+                         const float shift_data,
+                         const std::vector<float>& scale_weights,
+                         const bool force_fp32_output,
+                         MetaTensor* hidden,
+                         MetaTensor* cell,
+                         MetaTensor* xx,
+                         MetaTensor* batched_input,
+                         MetaTensor* batched_hidden,
+                         MetaTensor* batched_cell,
+                         MetaTensor* reordered_h0,
+                         MetaTensor* reordered_c0,
+                         MetaTensor* checked_cell) {
+  auto x_dims = x.dims();
+  PADDLE_ENFORCE_EQ(x_dims.size(),
+                    2,
+                    phi::errors::InvalidArgument(
+                        "Input(X)'s rank must be 2, but received x's rank "
+                        "is:%d, x dim is:[%s]",
+                        x_dims.size(),
+                        x_dims));
+
+  if (h0.initialized()) {
+    PADDLE_ENFORCE_EQ(
+        c0.initialized(),
+        true,
+        phi::errors::InvalidArgument(
+            "fusion_lstm must have h0 and c0 input at the same time."));
+    auto h_dims = h0.dims();
+    auto c_dims = c0.dims();
+    PADDLE_ENFORCE_EQ(h_dims,
+                      c_dims,
+                      phi::errors::InvalidArgument(
+                          "The dimension of Input(H0) and Input(C0) should be "
+                          "same, but received h0 dims is:[%s], c0 dims is:[%s]",
+                          h_dims,
+                          c_dims));
+  }
+
+  auto wx_dims = weight_x.dims();
+  PADDLE_ENFORCE_EQ(wx_dims.size(),
+                    2,
+                    phi::errors::InvalidArgument(
+                        "The rank of Input(WeightX) should be 2, but received "
+                        "WeightX's rank is:%d, WeightX dim is:[%s]",
+                        wx_dims.size(),
+                        wx_dims));
+  PADDLE_ENFORCE_EQ(wx_dims[0],
+                    x_dims[1],
+                    phi::errors::InvalidArgument(
+                        "The first dimension of Input(WeightX) "
+                        "should equal to second dimension of Input(X), but "
+                        "received WeightX first dim is:%d, X second dim is:%d",
+                        wx_dims[0],
+                        x_dims[1]));
+
+  int frame_size = static_cast<int>(wx_dims[1] / 4);
+  auto wh_dims = weight_h.dims();
+
+  PADDLE_ENFORCE_EQ(wh_dims.size(),
+                    2,
+                    phi::errors::InvalidArgument(
+                        "The rank of Input(WeightH) should be 2, but received "
+                        "WeightH rank is:%d, WeightH dim is:[%s]",
+                        wh_dims.size(),
+                        wh_dims));
+  PADDLE_ENFORCE_EQ(wh_dims[0],
+                    frame_size,
+                    phi::errors::InvalidArgument(
+                        "The first dimension of Input(WeightH) "
+                        "should equal to frame size, but received WeightH "
+                        "first dim is:%d, frame size is:%d.",
+                        wh_dims[0],
+                        frame_size));
+
+  PADDLE_ENFORCE_EQ(wh_dims[1],
+                    4 * frame_size,
+                    phi::errors::InvalidArgument(
+                        "The second dimension of Input(WeightH) "
+                        "should equal to 4 * frame_size, but received WeightH "
+                        "second dimension is:%d, frame size is:%d.",
+                        wh_dims[1],
+                        frame_size));
+
+  auto b_dims = bias.dims();
+  PADDLE_ENFORCE_EQ(b_dims.size(),
+                    2,
+                    phi::errors::InvalidArgument(
+                        "The rank of Input(Bias) should be 2, but received "
+                        "Bias rank is:%d, Bias dim is:[%s]",
+                        b_dims.size(),
+                        b_dims));
+  PADDLE_ENFORCE_EQ(b_dims[0],
+                    1,
+                    phi::errors::InvalidArgument(
+                        "The first dimension of Input(Bias) should be 1, but "
+                        "received Bias's dimension is:[%s]",
+                        b_dims));
+
+  if (use_peepholes) {
+    PADDLE_ENFORCE_EQ(b_dims[1],
+                      7 * frame_size,
+                      phi::errors::InvalidArgument(
+                          "The second dimension of Input(Bias) should be "
+                          "7 * %d if enable peepholes connection, but received "
+                          "Bias dim is:[%s]",
+                          frame_size,
+                          b_dims));
+    checked_cell->set_dims(phi::make_ddim({2, frame_size}));
+    checked_cell->set_dtype(x.dtype());
+  } else {
+    PADDLE_ENFORCE_EQ(
+        b_dims[1],
+        4 * frame_size,
+        phi::errors::InvalidArgument(
+            "The second dimension of Input(Bias) should be "
+            "4 * %d if disable peepholes, but received Bias dim is:[%s]",
+            frame_size,
+            b_dims));
+  }
+
+  auto out_dims = phi::make_ddim({x_dims[0], frame_size});
+  hidden->set_dims(out_dims);
+  cell->set_dims(out_dims);
+  hidden->share_lod(x);
+  cell->share_lod(x);
+  hidden->set_dtype(x.dtype());
+  cell->set_dtype(x.dtype());
+
+  int xx_width = 0;
+  if (use_seq) {
+    xx_width = static_cast<int>(wx_dims[1]);
+  } else {
+    xx_width =
+        static_cast<int>(x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]);
+
+    batched_input->set_dims(phi::make_ddim({x_dims[0], wx_dims[1]}));
+    batched_hidden->set_dims(out_dims);
+    batched_cell->set_dims(out_dims);
+    batched_input->set_dtype(x.dtype());
+    batched_hidden->set_dtype(x.dtype());
+    batched_cell->set_dtype(x.dtype());
+  }
+  xx->set_dims(phi::make_ddim({x_dims[0], xx_width}));
+  xx->set_dtype(x.dtype());
+  xx->share_lod(x);
+}
+
 void RoformerRelativePosXPUInferMeta(const MetaTensor& x,
                                      const MetaTensor& sin_emb,
                                      const MetaTensor& cos_emb,
diff --git a/paddle/phi/infermeta/fusion.h b/paddle/phi/infermeta/fusion.h
index 87999ab2b4564..a724000bab9f0 100644
--- a/paddle/phi/infermeta/fusion.h
+++ b/paddle/phi/infermeta/fusion.h
@@ -861,4 +861,31 @@ void MultiGruInferMeta(
     float shift_data,
     bool force_fp32_output,
     MetaTensor* hidden);
+
+void FusionLstmInferMeta(const MetaTensor& x,
+                         const MetaTensor& weight_x,
+                         const MetaTensor& weight_h,
+                         const MetaTensor& bias,
+                         const MetaTensor& h0,
+                         const MetaTensor& c0,
+                         const bool use_peepholes,
+                         const bool is_reverse,
+                         const bool use_seq,
+                         const std::string& gate_activation,
+                         const std::string& cell_activation,
+                         const std::string& candidate_activation,
+                         const float scale_data,
+                         const float shift_data,
+                         const std::vector<float>& scale_weights,
+                         const bool force_fp32_output,
+                         MetaTensor* hidden,
+                         MetaTensor* cell,
+                         MetaTensor* xx,
+                         MetaTensor* batched_input,
+                         MetaTensor* batched_hidden,
+                         MetaTensor* batched_cell,
+                         MetaTensor* reordered_h0,
+                         MetaTensor* reordered_c0,
+                         MetaTensor* checked_cell);
+
 }  // namespace phi
diff --git a/test/legacy_test/op_test.py b/test/legacy_test/op_test.py
index 41b9caed79480..c18a142a1ec9d 100644
--- a/test/legacy_test/op_test.py
+++ b/test/legacy_test/op_test.py
@@ -2643,7 +2643,9 @@ def _is_skip_name(self, name):
             static_checker.check()
             outs, fetch_list = static_checker.outputs, static_checker.fetch_list
 
-        if check_pir_onednn and place == base.CPUPlace():
+        if check_pir_onednn and isinstance(
+            place, paddle.base.libpaddle.CPUPlace
+        ):
             with pir_executor_guard():
                 pir_onednn_static_checker = StaticChecker(self, self.outputs)
                 pir_onednn_static_checker.check()
@@ -3313,7 +3315,9 @@ def check_grad_with_place(
                 atol,
             )
 
-        if check_pir_onednn and place == base.CPUPlace():
+        if check_pir_onednn and isinstance(
+            place, paddle.base.libpaddle.CPUPlace
+        ):
             with pir_executor_guard():
                 self.check_grad_with_place_for_static(
                     user_defined_grads,
diff --git a/test/legacy_test/test_fusion_lstm_op.py b/test/legacy_test/test_fusion_lstm_op.py
index bbcb5e8a8396c..e733d047daf26 100644
--- a/test/legacy_test/test_fusion_lstm_op.py
+++ b/test/legacy_test/test_fusion_lstm_op.py
@@ -140,7 +140,9 @@ def setUp(self):
     def test_check_output(self):
         for use_seq in {True, False}:
             self.attrs['use_seq'] = use_seq
-            self.check_output(check_dygraph=False)
+            self.check_output(
+                
check_dygraph=False, check_pir_onednn=self.check_pir_onednn + ) class TestFusionLSTMOpInit(TestFusionLSTMOp): diff --git a/test/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py b/test/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py index 9b8f1f684e2a4..c893238e758ec 100644 --- a/test/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py +++ b/test/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py @@ -32,7 +32,10 @@ def test_check_output(self): for use_seq in {True, False}: self.attrs['use_seq'] = use_seq self.check_output( - check_dygraph=False, no_check_set=["Cell"], atol=2e-2 + check_dygraph=False, + no_check_set=["Cell"], + atol=2e-2, + check_pir_onednn=True, ) def setUp(self): diff --git a/test/mkldnn/test_fusion_lstm_int8_mkldnn_op.py b/test/mkldnn/test_fusion_lstm_int8_mkldnn_op.py index 96bee8d9927bf..c876eb74ff626 100644 --- a/test/mkldnn/test_fusion_lstm_int8_mkldnn_op.py +++ b/test/mkldnn/test_fusion_lstm_int8_mkldnn_op.py @@ -145,6 +145,7 @@ def test_check_output(self): check_dygraph=False, no_check_set=["Cell"], atol=self.error_margin, + check_pir_onednn=True, ) diff --git a/test/mkldnn/test_fusion_lstm_mkldnn_op.py b/test/mkldnn/test_fusion_lstm_mkldnn_op.py index f9fdfa116acab..7be690aacf42f 100644 --- a/test/mkldnn/test_fusion_lstm_mkldnn_op.py +++ b/test/mkldnn/test_fusion_lstm_mkldnn_op.py @@ -20,11 +20,16 @@ class TestFusionLSTMONEDNNOp(TestFusionLSTMOp): def set_conf(self): self.use_mkldnn = True + self.check_pir_onednn = True def test_check_output(self): for use_seq in {True, False}: self.attrs['use_seq'] = use_seq - self.check_output(check_dygraph=False, no_check_set=["Cell"]) + self.check_output( + check_dygraph=False, + no_check_set=["Cell"], + check_pir_onednn=True, + ) class TestFusionLSTMONEDNNOpReverse(TestFusionLSTMONEDNNOp): diff --git a/test/white_list/op_accuracy_white_list.py b/test/white_list/op_accuracy_white_list.py index 98429a013f829..00d0ffccbac02 100644 --- a/test/white_list/op_accuracy_white_list.py +++ b/test/white_list/op_accuracy_white_list.py @@ -97,4 +97,5 @@ NO_BF16_COMPARED_WITH_FP32_OP_LIST = [ 'dequantize', + 'fusion_lstm', ] From 4c0243489e3c8f3e6bcfa924ad7ae720338eef0c Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Thu, 29 Feb 2024 19:06:24 +0800 Subject: [PATCH 11/55] pir onednn support transpose (#62219) --- .../fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml | 8 ++++++-- test/mkldnn/test_transpose_bf16_mkldnn_op.py | 4 +++- test/mkldnn/test_transpose_int8_mkldnn_op.py | 6 +++++- test/mkldnn/test_transpose_mkldnn_op.py | 8 ++++++-- 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml b/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml index fd8c3a409a573..283761ec09903 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml @@ -303,6 +303,10 @@ - op : tanh_grad -# - op : transpose +- op : transpose + extra_args : str data_format="AnyLayout", str mkldnn_data_type="float32" + data_format_tensors : x -# - op : transpose_grad +- op : transpose_grad + extra_args : str data_format="AnyLayout", str mkldnn_data_type="float32" + data_format_tensors : out_grad diff --git a/test/mkldnn/test_transpose_bf16_mkldnn_op.py b/test/mkldnn/test_transpose_bf16_mkldnn_op.py index bd0f8473205d6..4eff0b96bd5d2 100644 --- a/test/mkldnn/test_transpose_bf16_mkldnn_op.py +++ b/test/mkldnn/test_transpose_bf16_mkldnn_op.py @@ -47,7 +47,9 @@ def setUp(self): } def test_check_output(self): - self.check_output_with_place(core.CPUPlace(), 
no_check_set=['XShape']) + self.check_output_with_place( + core.CPUPlace(), no_check_set=['XShape'], check_pir_onednn=True + ) def init_test_case(self): self.shape = (2, 3, 4, 5) diff --git a/test/mkldnn/test_transpose_int8_mkldnn_op.py b/test/mkldnn/test_transpose_int8_mkldnn_op.py index b800d6b40c504..e2a3fba8d2bc0 100644 --- a/test/mkldnn/test_transpose_int8_mkldnn_op.py +++ b/test/mkldnn/test_transpose_int8_mkldnn_op.py @@ -50,7 +50,11 @@ def init_op_type(self): def test_check_output(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode self.check_output_with_place( - core.CPUPlace(), 1e-5, no_check_set=['XShape'], check_dygraph=False + core.CPUPlace(), + 1e-5, + no_check_set=['XShape'], + check_dygraph=False, + check_pir_onednn=True, ) def initTestCase(self): diff --git a/test/mkldnn/test_transpose_mkldnn_op.py b/test/mkldnn/test_transpose_mkldnn_op.py index 66185f9daaf48..34a25cf2f8b1e 100644 --- a/test/mkldnn/test_transpose_mkldnn_op.py +++ b/test/mkldnn/test_transpose_mkldnn_op.py @@ -38,11 +38,15 @@ def init_op_type(self): def test_check_output(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode - self.check_output(no_check_set=['XShape'], check_dygraph=False) + self.check_output( + no_check_set=['XShape'], check_dygraph=False, check_pir_onednn=True + ) def test_check_grad(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode - self.check_grad(['X'], 'Out', check_dygraph=False) + self.check_grad( + ['X'], 'Out', check_dygraph=False, check_pir_onednn=True + ) def initTestCase(self): self.shape = (30, 4) From bd7562d54dbaf18c023746460c6102c6e9d8f058 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=91=A8=E5=91=A8=E5=91=A8?= <39978853+zhoutianzi666@users.noreply.github.com> Date: Thu, 29 Feb 2024 20:13:28 +0800 Subject: [PATCH 12/55] [Paddle Inference]support sm80 cutlass conv2d (#62017) modify ../test/ir/inference/test_cutlass_fused_conv2d_add_act_op.py add conv+bias+elementwise_add add some to README.md * use write_kernel_to_file * add -std=c++17 in CUDA_NVCC_FLAGS for compiling cut --- paddle/fluid/framework/ir/cutlass_teller.h | 109 ++++++++++- .../fusion/cutlass/conv2d/CMakeLists.txt | 12 +- .../kernels/fusion/cutlass/conv2d/README.md | 6 + .../kernels/fusion/cutlass/conv2d/compile.sh | 2 +- .../fusion/cutlass/conv2d/conv2d_bias_act.py | 176 ++++++++++++++++- .../cutlass/conv2d/conv2d_bias_residual.py | 185 ++++++++++++++++-- .../fusion/cutlass/conv2d/conv2d_common.py | 35 +++- .../fusion/cutlass/conv2d/conv2d_decl.h | 17 +- .../conv2d/conv2d_depthwise_bias_act.py | 1 + .../fusion/cutlass/conv2d/conv2d_util.cu | 96 +++++---- .../fusion/cutlass/conv2d/conv2d_util.h | 1 + .../cutlass/fused_conv2d_add_act_kernel.cu | 91 ++++++--- paddle/phi/kernels/fusion/cutlass/util.py | 26 +++ 13 files changed, 650 insertions(+), 107 deletions(-) diff --git a/paddle/fluid/framework/ir/cutlass_teller.h b/paddle/fluid/framework/ir/cutlass_teller.h index 3d50544ede13b..2bc829e2fc8e9 100644 --- a/paddle/fluid/framework/ir/cutlass_teller.h +++ b/paddle/fluid/framework/ir/cutlass_teller.h @@ -1,5 +1,5 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. + // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
@@ -20,8 +20,9 @@ namespace framework {
 namespace ir {
 
 typedef enum {
-  cba,
-  cbaa,
+  cba,     // This serves conv_elementwise_add_fuse_pass
+  cbaa,    // This serves conv_elementwise_add2_act_fuse_pass
+  cbaele,  // This serves conv2d_fusion_cutlass_elementwise
 } CutlassFusionType;
 
 class CutlassTeller {
@@ -33,6 +34,7 @@ class CutlassTeller {
 #if defined(PADDLE_WITH_CUTLASS)
   // Determine this NCHW conv2d + bias can be fused with activation by cutlass?
+  // This serves conv_elementwise_add_fuse_pass.
   // will not set or change any attribute in op_desc
   bool CbaCanSupport(OpDesc *op_desc,
                      Scope *scope,
                      std::string act_type,
@@ -85,7 +87,8 @@ class CutlassTeller {
   }
 
   // Determine this NCHW conv2d + bias + elewise_add + act can be fused by
-  // cutlass? will not set or change any attribute in op_desc
+  // cutlass? This serves conv_elementwise_add2_act_fuse_pass.
+  // will not set or change any attribute in op_desc
   bool CbaaCanSupport(OpDesc *op_desc,
                       Scope *scope,
                       std::string act_type,
@@ -136,6 +139,69 @@ class CutlassTeller {
     return true;
   }
 
+  // Determine this NCHW conv2d_fusion + elewise_op + act1 can be fused by
+  // cutlass?
+  // This serves conv2d_fusion_cutlass_elementwise.
+  // will not set or change any attribute in op_desc
+  bool CbaeleCanSupport(OpDesc *op_desc,
+                        Scope *scope,
+                        std::string ele_type,
+                        std::string act1_type,
+                        int device_id) {
+    auto strides = op_desc->GetAttrIfExists<std::vector<int>>("strides");
+    auto dilations = op_desc->GetAttrIfExists<std::vector<int>>("dilations");
+    CHECK_EQ(strides.size() == 2UL, true);
+    CHECK_EQ(dilations.size() == 2UL, true);
+    int stride_h = strides[0];
+    int stride_w = strides[1];
+    int dilation_h = dilations[0];
+    int dilation_w = dilations[1];
+    auto act_type = op_desc->GetAttrIfExists<std::string>("activation");
+
+    // Do not allow a conv2d_fusion that already has a residual input.
+    if (op_desc->Input("ResidualData").size() >= 1) {
+      return false;
+    }
+
+    auto filter_names = op_desc->Input("Filter");
+
+    for (const auto &filter_name : filter_names) {
+      auto *filter_var = scope->FindLocalVar(filter_name);
+      const auto &filter_tensor = filter_var->Get<phi::DenseTensor>();
+      CHECK_EQ(filter_tensor.dims().size() == 4UL, true);
+      auto groups = op_desc->GetAttrIfExists<int>("groups");
+      int oc = filter_tensor.dims()[0];
+      int kc = filter_tensor.dims()[1];
+      int kh = filter_tensor.dims()[2];
+      int kw = filter_tensor.dims()[3];
+
+      // For convenience, we only support EXPLICIT
+      auto padding_algorithm =
+          op_desc->GetAttrIfExists<std::string>("padding_algorithm");
+      if (padding_algorithm != "EXPLICIT") {
+        return false;
+      }
+
+      if (!Conv2dCanSupport(oc,
+                            kc,
+                            kh,
+                            kw,
+                            stride_h,
+                            stride_w,
+                            dilation_h,
+                            dilation_w,
+                            groups,
+                            act_type,
+                            device_id,
+                            CutlassFusionType::cbaele,
+                            act1_type,
+                            ele_type)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
   // Determine whether this conv can be fused with the activation by cutlass
   // backend.
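   // For CutlassFusionType::cbaele, the activation lookup key is assembled
   // as "<activation>_<elementwise_type>_<activation1>" (for example
   // "swish_elementwise_add_identity") and must be present in
   // cbaele_act_set below for the fusion to be allowed.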
   bool Conv2dCanSupport(int oc,
@@ -149,7 +215,10 @@ class CutlassTeller {
                         int groups,
                         std::string activation,
                         int device_id,
-                        CutlassFusionType fuse_type) {
+                        CutlassFusionType fuse_type,
+                        // below two are used by cbaele
+                        std::string activation1 = "identity",
+                        std::string elemenstwise_type = "elementwise_add") {
     int sm_version = platform::GetGPUComputeCapability(device_id);
     int ic = kc * groups;
     if (!cutlass_sm.count(sm_version)) {
@@ -173,6 +242,14 @@ class CutlassTeller {
           !cbaa_act_set.count(activation)) {
         return false;
       }
+
+      // conv + bias + act + elementwise_op
+      if (fuse_type == CutlassFusionType::cbaele &&
+          !cbaele_act_set.count(activation + "_" + elemenstwise_type + "_" +
+                                activation1)) {
+        return false;
+      }
+
     } else if (groups == ic && ic == oc) {
       //  return false;
       // conv2d_depthwise not support residual input
@@ -250,6 +327,14 @@ class CutlassTeller {
     return false;
   }
 
+  bool CbaeleCanSupport(OpDesc *op_desc,
+                        Scope *scope,
+                        std::string ele_type,
+                        std::string act1_type,
+                        int device_id) {
+    return false;
+  }
+
   bool Conv2dCanSupport(int oc,
                         int kc,
                         int kh,
@@ -261,7 +346,10 @@ class CutlassTeller {
                         int groups,
                         std::string activation,
                         int device_id,
-                        CutlassFusionType fuse_type) {
+                        CutlassFusionType fuse_type,
+                        // below two are used by cbaele
+                        std::string activation1 = "identity",
+                        std::string elemenstwise_type = "elementwise_add") {
     return false;
   }
   std::unordered_set<std::string> CbaAct(int device_id) { return {}; }
@@ -270,6 +358,9 @@ class CutlassTeller {
   static const int CUTLASS_NHWC_ALIGNMENT = 8;
   const std::unordered_set<int> cutlass_sm = {
       75,
+      80,
+      85,
+      86,
   };
   const std::unordered_set<std::string> cba_act_set = {
      "relu", "swish", "identity", "leaky_relu", "sigmoid"};
  const std::unordered_set<std::string> cdba_act_set = {
      "identity", "relu", "swish", "sigmoid"};
  const std::unordered_set<std::string> cbaa_act_set = {"relu"};
+  const std::unordered_set<std::string> cbaele_act_set = {
+      "identity_elementwise_add_identity",
+      "swish_elementwise_add_identity",
+  };
 };
 }  // namespace ir
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/CMakeLists.txt b/paddle/phi/kernels/fusion/cutlass/conv2d/CMakeLists.txt
index cd82bbf1dc8b7..b77a565121bee 100644
--- a/paddle/phi/kernels/fusion/cutlass/conv2d/CMakeLists.txt
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/CMakeLists.txt
@@ -21,15 +21,17 @@ execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory
                 "${CMAKE_CURRENT_BINARY_DIR}/generated_tmp")
 
 execute_process(
-  COMMAND ${PYTHON_EXECUTABLE} "${CMAKE_CURRENT_SOURCE_DIR}/conv2d_bias_act.py"
+  COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/conv2d_bias_act.py
+          --cuda_arch ${COMPUTE_CAPABILITY}
+  COMMAND
+    ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/conv2d_bias_residual.py
+    --cuda_arch ${COMPUTE_CAPABILITY}
   COMMAND ${PYTHON_EXECUTABLE}
-          "${CMAKE_CURRENT_SOURCE_DIR}/conv2d_bias_residual.py"
-  COMMAND ${PYTHON_EXECUTABLE}
-          "${CMAKE_CURRENT_SOURCE_DIR}/conv2d_depthwise_bias_act.py"
+          ${CMAKE_CURRENT_SOURCE_DIR}/conv2d_depthwise_bias_act.py
   WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}")
 
 find_package(CUDA)
-
+# you can append -std=c++17 in CUDA_NVCC_FLAGS for compiling cutlass 3.0
 set(CUDA_NVCC_FLAGS
     -gencode arch=compute_${COMPUTE_CAPABILITY},code=sm_${COMPUTE_CAPABILITY};)
 #set(CMAKE_CXX_FLAGS -fvisibility=hidden)
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/README.md b/paddle/phi/kernels/fusion/cutlass/conv2d/README.md
index a717b3d692b91..4a2b6c6ac61aa 100644
--- a/paddle/phi/kernels/fusion/cutlass/conv2d/README.md
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/README.md
@@ -23,3 +23,9 @@ The compile.sh script downloads cutlass, runs the CMakeLists.txt script, and builds the library
 
 step2.
 After step1 finishes, `libCutlassConv2d.so` is generated in the build directory; add the build directory to LD_LIBRARY_PATH to use this library.
+
+
+step3.
+
+By default, Paddle Inference dispatches conv2d-type operators to the cuDNN implementation;
+the cutlass-based conv2d operators can fuse more post-processing operators, and users can call the Python API `exp_enable_use_cutlass()` or the C++ API `Exp_EnableUseCutlass()` to gain speed and GPU-memory benefits.
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/compile.sh b/paddle/phi/kernels/fusion/cutlass/conv2d/compile.sh
index 44c0fdf3a04da..d43bda262f543 100644
--- a/paddle/phi/kernels/fusion/cutlass/conv2d/compile.sh
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/compile.sh
@@ -25,7 +25,7 @@ fi
 
 python_exe_path="python"
 cuda_root_path="/usr/local/cuda"
-gpu_cc="75"
+gpu_cc="80"
 
 cd $build_directory
 cmake .. -DPYTHON_EXECUTABLE=$python_exe_path -DCUDA_TOOLKIT_ROOT_DIR=$cuda_root_path -DCOMPUTE_CAPABILITY=$gpu_cc
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_act.py b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_act.py
index 0cb925489f14a..2104c676c9b82 100644
--- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_act.py
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_act.py
@@ -21,7 +21,7 @@
     CommonTail,
     GenerateFunctionForPhi,
 )
-from util import SubstituteTemplate, TileDesc
+from util import SubstituteTemplate, TileDesc, parse_args, write_kernel_to_file
 
 
 # this is a file's header part
@@ -54,10 +54,10 @@
     + '''
   typename ImplicitGemm::Arguments arguments{
     problem_size,
-    {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}},
-    {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}},
-    {(cutlass::half_t *)(bias), {0, 0, 0}},
-    {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}},
+    {input, {ic, ic * iw, ic * iw * ih}},
+    {weight, {kc, kc * kw, kc * kw * kh}},
+    {bias, {0, 0, 0}},
+    {output, {oc, oc * ow, oc * ow * oh}},
     {1.f, 1.f}};
'''
    + CommonCutlassConvKernelExecute
@@ -170,10 +170,11 @@ def generate_sm75_1688():
    sm75_code = ""
    for epi_func in SupportedAct:
        op_dict = {}
-        op_dict["func_name"] = UnderScoreName[epi_func].lower() + "_sm75"
+        op_dict["func_name"] = UnderScoreName[epi_func].lower() + "_sm75_fp16"
        op_dict["enum_op_name"] = UnderScoreName[epi_func].upper()
        # For a function, we record all its kernels into a std::vector in C++ code
        all_kernel_names = ""
+        all_kernel_declares = ""
        kernel_dict["epi_func"] = ActTag[epi_func]
        suffix = 0
        for iterator_algorithm in iterator_algorithms:
@@ -203,23 +204,178 @@ def generate_sm75_1688():
                    cba_kernel = cba_kernel_no_alpha
                    if epi_func in [CbaAct.LeakyRelu]:
                        cba_kernel = cba_kernel_alpha
-                    sm75_code += SubstituteTemplate(cba_kernel, kernel_dict)
+                    # sm75_code += SubstituteTemplate(cba_kernel, kernel_dict)
+
+                    kernel_str = (
+                        cba_header
+                        + SubstituteTemplate(cba_kernel, kernel_dict)
+                        + CommonTail
+                    )
+                    file_name = (
+                        "generated_tmp/"
+                        + kernel_dict["kernel_func_name"]
+                        + ".cu"
+                    )
+                    write_kernel_to_file(kernel_str, file_name)
+
                    all_kernel_names += (
                        kernel_dict["kernel_func_name"] + ", \n"
                    )
+                    all_kernel_declares += (
+                        "cutlass::Status "
+                        + kernel_dict["kernel_func_name"]
+                        + "(const ConvAllParams& params);"
+                    )
 
    # Generate op code
+    op_dict["kernel_func_declare"] = all_kernel_declares
    op_dict["all_kernel_func_name"] = all_kernel_names
    sm75_code += SubstituteTemplate(CommonConvFunction, op_dict)
    return sm75_code
+
+
+def generate_sm80_16816(cutlass_dtype="cutlass::half_t"):
+    kernel_dict = {
+        "element_a": cutlass_dtype,
+        "layout_a": "cutlass::layout::TensorNHWC",
+        "element_b": 
cutlass_dtype, + "layout_b": "cutlass::layout::TensorNHWC", + "element_c": cutlass_dtype, + "layout_c": "cutlass::layout::TensorNHWC", + "opcode_class": "cutlass::arch::OpClassTensorOp", + "arch": "cutlass::arch::Sm80", + "swizzling_functor": "cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>", + # alpha is always float! + "element_epilogue": "float", + "math_operator": "cutlass::arch::OpMultiplyAdd", + } + + kernel_dict["stride_support"] = "cutlass::conv::StrideSupport::kStrided" + + # iterate over this loop + iterator_algorithms = [ + "cutlass::conv::IteratorAlgorithm::kOptimized", + ] + + math_instructions = [ + ( + "16,8,16", + cutlass_dtype, + cutlass_dtype, + "float", + ), + ] + + alignments = [8] + + kernel_dict["align_a"] = "8" + kernel_dict["align_b"] = "8" + # this should divided by oc + kernel_dict["epilogue_vector_length"] = "8" + kernel_dict["split_k_slices"] = "1" + + sm80_code = "" + for epi_func in SupportedAct: + op_dict = {} + op_dict["func_name"] = ( + UnderScoreName[epi_func].lower() + + "_sm80_" + + ("fp16" if "half" in cutlass_dtype else "bf16") + ) + op_dict["enum_op_name"] = UnderScoreName[epi_func].upper() + # For a function, we record all its kernels into a std::vector in C++ code + all_kernel_names = "" + all_kernel_declares = "" + kernel_dict["epi_func"] = ActTag[epi_func] + suffix = 0 + for iterator_algorithm in iterator_algorithms: + for alignment in alignments: + for math_inst in math_instructions: + tiles = [ + TileDesc("256, 128, 32", 3, "64, 64, 32", math_inst), + TileDesc("128, 256, 32", 3, "64, 64, 32", math_inst), + TileDesc("256, 64, 32", 3, "64, 64, 32", math_inst), + TileDesc("256, 64, 32", 4, "64, 64, 32", math_inst), + TileDesc("64, 256, 32", 4, "64, 64, 32", math_inst), + TileDesc("128, 128, 32", 3, "64, 64, 32", math_inst), + TileDesc("128, 128, 32", 4, "64, 64, 32", math_inst), + TileDesc("128, 128, 32", 5, "64, 64, 32", math_inst), + TileDesc("128, 64, 32", 6, "64, 32, 32", math_inst), + TileDesc("64, 128, 32", 6, "32, 64, 32", math_inst), + TileDesc("64, 64, 32", 10, "32, 32, 32", math_inst), + TileDesc("256, 128, 64", 3, "64, 64, 64", math_inst), + TileDesc("128, 256, 64", 3, "64, 64, 64", math_inst), + TileDesc("256, 64, 64", 4, "64, 64, 64", math_inst), + TileDesc("64, 256, 64", 4, "64, 64, 64", math_inst), + TileDesc("128, 128, 64", 4, "64, 64, 64", math_inst), + TileDesc("256, 64, 64", 3, "64, 64, 64", math_inst), + TileDesc("64, 256, 64", 3, "64, 64, 64", math_inst), + TileDesc("128, 128, 64", 3, "64, 64, 64", math_inst), + TileDesc("128, 64, 64", 3, "64, 32, 64", math_inst), + TileDesc("64, 128, 64", 3, "32, 64, 64", math_inst), + TileDesc("64, 64, 64", 5, "32, 32, 64", math_inst), + ] + for tile in tiles: + kernel_dict["iterator_algorithm"] = iterator_algorithm + kernel_dict["Tshape"] = tile.Tshape + kernel_dict["Wshape"] = tile.Wshape + kernel_dict["Ishape"] = tile.math_inst[0] + kernel_dict["stages"] = str(tile.stages) + kernel_dict["element_accum"] = tile.math_inst[3] + kernel_dict["kernel_func_name"] = op_dict[ + "func_name" + ] + str(suffix) + suffix += 1 + cba_kernel = cba_kernel_no_alpha + if epi_func in [CbaAct.LeakyRelu]: + cba_kernel = cba_kernel_alpha + # sm80_code += SubstituteTemplate(cba_kernel, kernel_dict) + + kernel_str = ( + cba_header + + SubstituteTemplate(cba_kernel, kernel_dict) + + CommonTail + ) + file_name = ( + "generated_tmp/" + + kernel_dict["kernel_func_name"] + + ".cu" + ) + write_kernel_to_file(kernel_str, file_name) + + all_kernel_names += ( + kernel_dict["kernel_func_name"] + ", \n" + ) + + 
all_kernel_declares += ( + "cutlass::Status " + + kernel_dict["kernel_func_name"] + + "(const ConvAllParams& params);" + ) + + # Generate op code + op_dict["kernel_func_declare"] = all_kernel_declares + op_dict["all_kernel_func_name"] = all_kernel_names + sm80_code += SubstituteTemplate(CommonConvFunction, op_dict) + return sm80_code + + if __name__ == "__main__": - sm_versions = ["75"] + sm_versions_and_types = [] + args = parse_args() + all_code = cba_header - all_code += generate_sm75_1688() + if args.cuda_arch == "75": + sm_versions_and_types.append(["75", "fp16"]) + all_code += generate_sm75_1688() + if args.cuda_arch in ["80", "86", "89"]: + sm_versions_and_types.append(["80", "fp16"]) + sm_versions_and_types.append(["80", "bf16"]) + all_code += generate_sm80_16816() + all_code += generate_sm80_16816(cutlass_dtype="cutlass::bfloat16_t") + all_code += GenerateFunctionForPhi( - sm_versions, SupportedAct, UnderScoreName, CamelName + sm_versions_and_types, SupportedAct, UnderScoreName, CamelName ) all_code += CommonTail with open("generated_tmp/conv2d_bias_act.cu", "w") as f: diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_residual.py b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_residual.py index 55fde0722b6b3..629ffc12415e9 100644 --- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_residual.py +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_residual.py @@ -21,7 +21,7 @@ CommonTail, GenerateFunctionForPhi, ) -from util import SubstituteTemplate, TileDesc +from util import SubstituteTemplate, TileDesc, parse_args, write_kernel_to_file # this is a file's header part @@ -48,13 +48,12 @@ cbr_kernel = ( SubstituteTemplate(CommonCutlassConvKernelDeclare, dict_for_declare_part) + ''' - const half *residual = params.residual; typename ImplicitGemm::Arguments arguments{ problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {input, {ic, ic * iw, ic * iw * ih}}, + {weight, {kc, kc * kw, kc * kw * kh}}, + {residual, {oc, oc * ow, oc * ow * oh}}, + {output, {oc, oc * ow, oc * ow * oh}}, {1.f, 1.f}, cutlass::conv::SplitKMode::kSerial, (cutlass::half_t *)(bias), nullptr, @@ -80,16 +79,19 @@ class CbrAct(enum.Enum): SupportedEpilogue = [ (CbrAct.Silu, "cutlass::plus", CbrAct.Identity), (CbrAct.Identity, "cutlass::plus", CbrAct.Relu), + (CbrAct.Identity, "cutlass::plus", CbrAct.Identity), ] UnderScoreName = { SupportedEpilogue[0]: "conv2d_bias_silu_add", SupportedEpilogue[1]: "conv2d_bias_add_relu", + SupportedEpilogue[2]: "conv2d_bias_add", } CamelName = { SupportedEpilogue[0]: "Conv2dBiasSiluAdd", SupportedEpilogue[1]: "Conv2dBiasAddRelu", + SupportedEpilogue[2]: "Conv2dBiasAdd", } # Generate sm75 TensorOp conv code. 
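# A minimal sketch of the SubstituteTemplate helper that these generator
# scripts import from util.py; util.py is not part of this patch, so the
# body below is an assumption about its behavior: keep replacing ${key}
# placeholders until the text stops changing, so that nested templates
# (e.g. CommonCutlassConvKernelDeclare inside cbr_kernel) expand fully.
import re


def SubstituteTemplate(template, values):
    text = template
    changed = True
    while changed:
        changed = False
        for key, value in values.items():
            regex = r"\$\{%s\}" % key
            newtext = re.sub(regex, value, text)
            if newtext != text:
                changed = True
            text = newtext
    return text


# Example: SubstituteTemplate("${arch}", {"arch": "cutlass::arch::Sm80"})
# returns "cutlass::arch::Sm80".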
@@ -150,10 +152,13 @@ def generate_sm75_1688(): sm75_code = "" for epi_res_block in SupportedEpilogue: op_dict = {} - op_dict["func_name"] = UnderScoreName[epi_res_block].lower() + "_sm75" + op_dict["func_name"] = ( + UnderScoreName[epi_res_block].lower() + "_sm75_fp16" + ) op_dict["enum_op_name"] = UnderScoreName[epi_res_block].upper() # for a op, we record all its kernels into a std::vector in C++ code all_kernel_names = "" + all_kernel_declares = "" suffix = 0 for iterator_algorithm in iterator_algorithms: for alignment in alignments: @@ -188,23 +193,179 @@ def generate_sm75_1688(): kernel_dict["act2"] = ActTag[epi_res_block[2]] suffix += 1 - sm75_code += SubstituteTemplate(cbr_kernel, kernel_dict) + # sm75_code += SubstituteTemplate(cbr_kernel, kernel_dict) + + kernel_str = ( + cbr_header + + SubstituteTemplate(cbr_kernel, kernel_dict) + + CommonTail + ) + file_name = ( + "generated_tmp/" + + kernel_dict["kernel_func_name"] + + ".cu" + ) + write_kernel_to_file(kernel_str, file_name) + all_kernel_names += ( kernel_dict["kernel_func_name"] + ", \n" ) + all_kernel_declares += ( + "cutlass::Status " + + kernel_dict["kernel_func_name"] + + "(const ConvAllParams& params);" + ) - # Generate op code with sm_version + # Generate op code + op_dict["kernel_func_declare"] = all_kernel_declares op_dict["all_kernel_func_name"] = all_kernel_names sm75_code += SubstituteTemplate(CommonConvFunction, op_dict) return sm75_code +def generate_sm80_16816(cutlass_dtype="cutlass::half_t"): + kernel_dict = { + "conv_kind_name": "Fprop", + "element_a": cutlass_dtype, + "layout_a": "cutlass::layout::TensorNHWC", + "element_b": cutlass_dtype, + "layout_b": "cutlass::layout::TensorNHWC", + "element_c": cutlass_dtype, + "layout_c": "cutlass::layout::TensorNHWC", + "opcode_class": "cutlass::arch::OpClassTensorOp", + "arch": "cutlass::arch::Sm80", + "swizzling_functor": "cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>", + # alpha is always float! 
+ "element_epilogue": "float", + "math_operator": "cutlass::arch::OpMultiplyAdd", + "element_residul": cutlass_dtype, + } + + kernel_dict["stride_support"] = "cutlass::conv::StrideSupport::kStrided" + + # iterate over this loop + iterator_algorithms = [ + "cutlass::conv::IteratorAlgorithm::kOptimized", + ] + + math_instructions = [ + ( + "16,8,16", + cutlass_dtype, + cutlass_dtype, + "float", + ), + ] + + alignments = [8] + + kernel_dict["align_a"] = "8" + kernel_dict["align_b"] = "8" + kernel_dict["epilogue_vector_length"] = "8" + kernel_dict["split_k_slices"] = "1" + + sm80_code = "" + for epi_res_block in SupportedEpilogue: + op_dict = {} + op_dict["func_name"] = ( + UnderScoreName[epi_res_block].lower() + + "_sm80_" + + ("fp16" if "half" in cutlass_dtype else "bf16") + ) + + op_dict["enum_op_name"] = UnderScoreName[epi_res_block].upper() + # for a op, we record all its kernels into a std::vector in C++ code + all_kernel_names = "" + all_kernel_declares = "" + suffix = 0 + for iterator_algorithm in iterator_algorithms: + for alignment in alignments: + for math_inst in math_instructions: + tiles = [ + TileDesc("256, 128, 32", 3, "64, 64, 32", math_inst), + TileDesc("128, 256, 32", 3, "64, 64, 32", math_inst), + TileDesc("256, 64, 32", 3, "64, 64, 32", math_inst), + TileDesc("256, 64, 32", 4, "64, 64, 32", math_inst), + TileDesc("64, 256, 32", 4, "64, 64, 32", math_inst), + TileDesc("128, 128, 32", 3, "64, 64, 32", math_inst), + TileDesc("128, 128, 32", 4, "64, 64, 32", math_inst), + TileDesc("128, 128, 32", 5, "64, 64, 32", math_inst), + TileDesc("128, 64, 32", 6, "64, 32, 32", math_inst), + TileDesc("64, 128, 32", 6, "32, 64, 32", math_inst), + TileDesc("64, 64, 32", 10, "32, 32, 32", math_inst), + TileDesc("256, 128, 64", 3, "64, 64, 64", math_inst), + TileDesc("128, 256, 64", 3, "64, 64, 64", math_inst), + TileDesc("256, 64, 64", 4, "64, 64, 64", math_inst), + TileDesc("64, 256, 64", 4, "64, 64, 64", math_inst), + TileDesc("128, 128, 64", 4, "64, 64, 64", math_inst), + TileDesc("256, 64, 64", 3, "64, 64, 64", math_inst), + TileDesc("64, 256, 64", 3, "64, 64, 64", math_inst), + TileDesc("128, 128, 64", 3, "64, 64, 64", math_inst), + TileDesc("128, 64, 64", 3, "64, 32, 64", math_inst), + TileDesc("64, 128, 64", 3, "32, 64, 64", math_inst), + TileDesc("64, 64, 64", 5, "32, 32, 64", math_inst), + ] + + for tile in tiles: + kernel_dict["iterator_algorithm"] = iterator_algorithm + kernel_dict["Tshape"] = tile.Tshape + kernel_dict["Wshape"] = tile.Wshape + kernel_dict["Ishape"] = tile.math_inst[0] + kernel_dict["stages"] = str(tile.stages) + kernel_dict["element_accum"] = tile.math_inst[3] + kernel_dict["kernel_func_name"] = op_dict[ + "func_name" + ] + str(suffix) + kernel_dict["act1"] = ActTag[epi_res_block[0]] + kernel_dict["binary"] = epi_res_block[1] + kernel_dict["act2"] = ActTag[epi_res_block[2]] + suffix += 1 + + # sm80_code += SubstituteTemplate(cbr_kernel, kernel_dict) + kernel_str = ( + cbr_header + + SubstituteTemplate(cbr_kernel, kernel_dict) + + CommonTail + ) + file_name = ( + "generated_tmp/" + + kernel_dict["kernel_func_name"] + + ".cu" + ) + write_kernel_to_file(kernel_str, file_name) + + all_kernel_names += ( + kernel_dict["kernel_func_name"] + ", \n" + ) + all_kernel_declares += ( + "cutlass::Status " + + kernel_dict["kernel_func_name"] + + "(const ConvAllParams& params);" + ) + + # Generate op code + op_dict["kernel_func_declare"] = all_kernel_declares + op_dict["all_kernel_func_name"] = all_kernel_names + sm80_code += SubstituteTemplate(CommonConvFunction, op_dict) + 
return sm80_code + + if __name__ == "__main__": - sm_versions = ["75"] + sm_versions_and_types = [] + args = parse_args() + all_code = cbr_header - all_code += generate_sm75_1688() + if args.cuda_arch == "75": + sm_versions_and_types.append(["75", "fp16"]) + all_code += generate_sm75_1688() + if args.cuda_arch in ["80", "86", "89"]: + sm_versions_and_types.append(["80", "fp16"]) + sm_versions_and_types.append(["80", "bf16"]) + all_code += generate_sm80_16816() + all_code += generate_sm80_16816(cutlass_dtype="cutlass::bfloat16_t") + all_code += GenerateFunctionForPhi( - sm_versions, SupportedEpilogue, UnderScoreName, CamelName + sm_versions_and_types, SupportedEpilogue, UnderScoreName, CamelName ) all_code += CommonTail with open("generated_tmp/conv2d_bias_residual.cu", "w") as f: diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_common.py b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_common.py index 7c95892006c43..6dbf6bcbbb82a 100644 --- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_common.py +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_common.py @@ -51,10 +51,14 @@ using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; + + ${element_a} *input = (${element_a} *)(params.input); + ${element_b} *weight = (${element_b} *)(params.weight); + ${element_c} *bias = (${element_c} *)(params.bias); + ${element_c} *output = (${element_c} *)(params.output); + // only used by conv2d_bias_residual + auto residual = (${element_c} *)(params.residual); + int batch = params.batch; int ic = params.ic; int ih = params.ih; @@ -112,6 +116,9 @@ # ${enum_op_name} is like CONV2D_BIAS_SILU CommonConvFunction = """ + +${kernel_func_declare} + std::vector> ${func_name}_all_func = {${all_kernel_func_name}}; @@ -163,8 +170,15 @@ """ +def convert_c_data_type(dtype): + if dtype == "fp16": + return "Conv2dDataType::fp16" + if dtype == "bf16": + return "Conv2dDataType::bf16" + + CommonDispatchTemp = ''' - if (params.sm_version == ${sm_code}) + if (params.sm_version == ${sm_code} && params.data_type == ${data_type}) { ${op_name_with_sm}(params); } @@ -182,16 +196,21 @@ # Wrap different sm versions into a function called by phi def GenerateFunctionForPhi( - sm_versions, support_epi_funcs, underscore_names, camel_names + sm_versions_and_types, support_epi_funcs, underscore_names, camel_names ): generated_code = "" for epi_func in support_epi_funcs: dispatch_body = "" - for sm_version in sm_versions: + for sm_version, data_type in sm_versions_and_types: sm_dicts = {} sm_dicts["sm_code"] = sm_version + sm_dicts["data_type"] = convert_c_data_type(data_type) sm_dicts["op_name_with_sm"] = ( - underscore_names[epi_func].lower() + "_sm" + sm_version + underscore_names[epi_func].lower() + + "_sm" + + sm_version + + "_" + + data_type ) dispatch_body += SubstituteTemplate(CommonDispatchTemp, sm_dicts) op_dicts = {} diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_decl.h b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_decl.h index aaad46de5cb0d..b29ce65f5230a 100644 --- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_decl.h +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_decl.h @@ -20,12 +20,18 @@ namespace phi { namespace fusion { namespace cutlass_internal { +typedef enum { + fp32, + fp16, + bf16, +} Conv2dDataType; + typedef struct { - const half *input; - const half *weight; - const half *bias; - const half *residual; - half *output; 
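+  // Type-erased buffers: one ConvAllParams now serves fp16 and bf16;
+  // the generated kernels cast back to the dtype selected by data_type.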
+ const void *input; + const void *weight; + const void *bias; + const void *residual; + void *output; int batch; int ic; int ih; @@ -48,6 +54,7 @@ typedef struct { cudaStream_t stream; float alpha; // for leaky_relu use int sm_version = 75; + Conv2dDataType data_type; void *workspace = nullptr; } ConvAllParams; diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_depthwise_bias_act.py b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_depthwise_bias_act.py index fb2f2be096110..5114d69e97060 100644 --- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_depthwise_bias_act.py +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_depthwise_bias_act.py @@ -208,6 +208,7 @@ def generate_conv2d_depthwise(): ) # generate op code op_dict["all_kernel_func_name"] = all_kernel_names + op_dict["kernel_func_declare"] = ";" all_code += SubstituteTemplate(CommonConvFunction, op_dict) return all_code diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu index 51bc71983105a..0a08cd165519d 100644 --- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu @@ -26,10 +26,11 @@ struct logical_coord { int w; }; -float diff(const half *c, const float *c_baseline, int n) { +template +float diff(const T *c, const float *c_baseline, int n) { float max_diff = -1.; for (int i = 0; i < n; i++) { - float c_value = __half2float(c[i]); + float c_value = static_cast(c[i]); if (std::abs(c_baseline[i] - c_value) > max_diff) { max_diff = std::abs(c_baseline[i] - c_value); } @@ -42,10 +43,10 @@ __device__ int gpu_nhwc(struct logical_coord shape, return index.n * shape.h * shape.w * shape.c + index.h * shape.w * shape.c + index.w * shape.c + index.c; } - -__global__ void naive_conv2d_kernel(const half *input, - const half *weight, - const half *bias, +template +__global__ void naive_conv2d_kernel(const T *input, + const T *weight, + const T *bias, float *output, int batch, int ic, @@ -63,7 +64,7 @@ __global__ void naive_conv2d_kernel(const half *input, int oh, int ow, int groups, - const half *residual, + const T *residual, float alpha, // for leaky_relu OpType op_type) { int M = batch * oh * ow; @@ -100,12 +101,12 @@ __global__ void naive_conv2d_kernel(const half *input, if (iw_i < 0 || iw_i >= iw) continue; struct logical_coord input_index = {batch_i, ic_i, ih_i, iw_i}; - const half *weight_ptr = weight + gpu_nhwc(weight_shape, weight_index); - const half *in_ptr = input + gpu_nhwc(input_shape, input_index); - sum += __half2float(*in_ptr) * __half2float(*weight_ptr); + const T *weight_ptr = weight + gpu_nhwc(weight_shape, weight_index); + const T *in_ptr = input + gpu_nhwc(input_shape, input_index); + sum += static_cast(*in_ptr) * static_cast(*weight_ptr); } - sum += __half2float(*(bias + oc_i)); + sum += static_cast(*(bias + oc_i)); float x = sum; switch (op_type) { @@ -121,10 +122,19 @@ __global__ void naive_conv2d_kernel(const half *input, case CONV2D_DEPTHWISE_BIAS_SILU: *out_ptr = x * (1.f / (1 + exp(-x))); break; + case CONV2D_BIAS_SILU_ADD: + x = x * (1.f / (1 + exp(-x))); + x += static_cast(*(residual + out_offset)); + *out_ptr = x; + break; case CONV2D_BIAS_ADD_RELU: - x += __half2float(*(residual + out_offset)); + x += static_cast(*(residual + out_offset)); *out_ptr = x > 0 ? x : 0; break; + case CONV2D_BIAS_ADD: + x += static_cast(*(residual + out_offset)); + *out_ptr = x; + break; case CONV2D_BIAS_LEAKY_RELU: *out_ptr = x > 0 ? 
x : (x * alpha); break; @@ -136,12 +146,12 @@ __global__ void naive_conv2d_kernel(const half *input, break; } } - -float conv2d_diff_gpu(const ConvAllParams ¶ms, OpType op_type) { - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; +template +float conv2d_diff_gpu(const ConvAllParams ¶ms, OpType op_type, T a) { + const T *input = (const T *)(params.input); + const T *weight = (const T *)(params.weight); + const T *bias = (const T *)(params.bias); + T *output = static_cast(params.output); int batch = params.batch; int ic = params.ic; int ih = params.ih; @@ -155,7 +165,7 @@ float conv2d_diff_gpu(const ConvAllParams ¶ms, OpType op_type) { int stride_w = params.stride_w; int dilation_h = params.dilation_h; int dilation_w = params.dilation_w; - const half *residual = params.residual; + const T *residual = (const T *)(params.residual); int groups = params.groups; int oh = params.oh; @@ -169,11 +179,11 @@ float conv2d_diff_gpu(const ConvAllParams ¶ms, OpType op_type) { uint3 block = {blockM, blockN, 1}; int output_size = batch * oc * oh * ow; - half *output_from_cutlass = - reinterpret_cast(malloc(sizeof(half) * output_size)); + T *output_from_cutlass = + reinterpret_cast(malloc(sizeof(T) * output_size)); cudaMemcpy(output_from_cutlass, output, - output_size * sizeof(half), + output_size * sizeof(T), cudaMemcpyDeviceToHost); float *gpu_output; @@ -207,6 +217,13 @@ float conv2d_diff_gpu(const ConvAllParams ¶ms, OpType op_type) { gpu_output, output_size * sizeof(float), cudaMemcpyDeviceToHost); + + // cudaMemcpy(output, + // gpu_output, + // output_size * sizeof(T), + // cudaMemcpyDeviceToDevice); + // cudaMemset(output, 0, output_size * sizeof(T)); + float max_diff = diff(output_from_cutlass, output_from_gpu, output_size); free(output_from_cutlass); @@ -232,6 +249,12 @@ std::string OpType2String(OpType op_type) { case CONV2D_BIAS_ADD_RELU: return "conv2d_bias_add_relu"; break; + case CONV2D_BIAS_ADD: + return "conv2d_bias_add"; + break; + case CONV2D_BIAS_SILU_ADD: + return "conv2d_bias_silu_add"; + break; case CONV2D_BIAS_LEAKY_RELU: return "conv2d_bias_leaky_relu"; case CONV2D_DEPTHWISE_BIAS: @@ -253,7 +276,7 @@ int ProfileToGetBestConfig( const ConvAllParams ¶ms, OpType op_type) { constexpr int WARMUP = 10; - constexpr int REPEAT = 100; + constexpr int REPEAT = 10; float min_time = 100000.f; int min_time_index = -1; for (int i = 0; i < all_func.size(); i++) { @@ -286,11 +309,23 @@ int ProfileToGetBestConfig( if (elapsed_time < min_time && status == cutlass::Status::kSuccess) { min_time = elapsed_time; min_time_index = i; - // debug code - std::cout << OpType2String(op_type) << ": tactic " << i - << " has max diff " << conv2d_diff_gpu(params, op_type) - << " compared with baseline," - << "cost_time: " << elapsed_time << "ms." << std::endl; + + if (params.data_type == Conv2dDataType::fp16) { + // debug code + std::cout << OpType2String(op_type) << ": tactic " << i + << " has max diff " + << conv2d_diff_gpu(params, op_type, (half)(1.0)) + << " compared with baseline," + << "cost_time: " << elapsed_time << "ms." << std::endl; + } else if (params.data_type == Conv2dDataType::bf16) { + // debug code + std::cout << OpType2String(op_type) << ": tactic " << i + << " has max diff " + << conv2d_diff_gpu( + params, op_type, static_cast(1.0)) + << " compared with baseline," + << "cost_time: " << elapsed_time << "ms." 
<< std::endl; + } } } @@ -301,11 +336,6 @@ int ProfileToGetBestConfig( return min_time_index; } -__attribute__((dllexport)) int HelloFromCutlassConv2d(int a, int b) { - std::cout << "welcom using Cutlass Conv2d" << std::endl; - return 1; -} - } // namespace cutlass_internal } // namespace fusion } // namespace phi diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h index 80865e0e1cded..508b8a8f1ae3b 100644 --- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h @@ -37,6 +37,7 @@ typedef enum { CONV2D_BIAS, CONV2D_BIAS_RELU, CONV2D_BIAS_ADD_RELU, + CONV2D_BIAS_ADD, CONV2D_BIAS_SILU, CONV2D_BIAS_LEAKY_RELU, CONV2D_BIAS_SIGMOID, diff --git a/paddle/phi/kernels/fusion/cutlass/fused_conv2d_add_act_kernel.cu b/paddle/phi/kernels/fusion/cutlass/fused_conv2d_add_act_kernel.cu index dceaafd2e7172..5c09b92fd83de 100644 --- a/paddle/phi/kernels/fusion/cutlass/fused_conv2d_add_act_kernel.cu +++ b/paddle/phi/kernels/fusion/cutlass/fused_conv2d_add_act_kernel.cu @@ -98,30 +98,66 @@ void FusedConv2dAddActKernel(const Context& ctx, const int oh = out_dims[1]; const int ow = out_dims[2]; - ConvAllParams params = {reinterpret_cast(x.data()), - reinterpret_cast(filter.data()), - reinterpret_cast(bias.data()), - nullptr, - reinterpret_cast(output->data()), - batch, - ic, - ih, - iw, - kh, - kw, - oc, - pad_h0, - pad_h1, - pad_w0, - pad_w1, - stride_h, - stride_w, - dilation_h, - dilation_w, - oh, - ow, - groups, - ctx.stream()}; + int64_t device_id = ctx.GetPlace().GetDeviceId(); + int sm_version = backends::gpu::GetGPUComputeCapability(device_id); + + auto get_conv2d_dtype = [&](decltype(x.dtype()) x_type) + -> phi::fusion::cutlass_internal::Conv2dDataType { + switch (x_type) { + case phi::DataType::FLOAT32: + return Conv2dDataType::fp32; + case phi::DataType::FLOAT16: + return Conv2dDataType::fp16; + case phi::DataType::BFLOAT16: + return Conv2dDataType::bf16; + } + }; + + auto cutlass_dispatch_sm_version = [&](int device_sm_version) -> int { + if (device_sm_version < 75) { + PADDLE_ENFORCE_GE( + device_sm_version, + 75, + phi::errors::PreconditionNotMet( + "fused_conv2d_add_act only supports sm >= 75, but got %d.", + device_sm_version)); + } else if (device_sm_version > 80) { + return 80; + } else { + return device_sm_version; + } + }; + + ConvAllParams params = { + reinterpret_cast(x.data()), + reinterpret_cast(filter.data()), + reinterpret_cast(bias.data()), + nullptr, + reinterpret_cast(output->data()), + batch, + ic, + ih, + iw, + kh, + kw, + oc, + pad_h0, + pad_h1, + pad_w0, + pad_w1, + stride_h, + stride_w, + dilation_h, + dilation_w, + oh, + ow, + groups, + ctx.stream(), + 0, // alpha + cutlass_dispatch_sm_version(sm_version), + get_conv2d_dtype(x.dtype()), + nullptr, + }; void* dlhandler = phi::dynload::GetCutlassConv2dHandle(); func conv_func = NULL; @@ -161,11 +197,13 @@ void FusedConv2dAddActKernel(const Context& ctx, CHECK_EQ(groups == 1, true); if (residual) { if (activation == "relu") { - params.residual = reinterpret_cast(residual->data()); + params.residual = reinterpret_cast(residual->data()); conv_func = (func)(dlsym(dlhandler, "Conv2dBiasAddRelu")); } else { PADDLE_THROW(phi::errors::InvalidArgument( - "Cutlass now only support relu activation in a residual block")); + "Cutlass now only support relu activation in a residual block, but " + "got %s.", + activation.c_str())); } } else if (activation == "relu") { conv_func = (func)(dlsym(dlhandler, 
"Conv2dBiasRelu")); @@ -194,4 +232,5 @@ PD_REGISTER_KERNEL(fused_conv2d_add_act, ALL_LAYOUT, phi::fusion::cutlass_internal::FusedConv2dAddActKernel, float, + phi::dtype::bfloat16, phi::dtype::float16) {} diff --git a/paddle/phi/kernels/fusion/cutlass/util.py b/paddle/phi/kernels/fusion/cutlass/util.py index 200960f39c56e..d3ffb648362f6 100644 --- a/paddle/phi/kernels/fusion/cutlass/util.py +++ b/paddle/phi/kernels/fusion/cutlass/util.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import argparse import re @@ -35,3 +36,28 @@ def SubstituteTemplate(template, values): changed = True text = newtext return text + + +def parse_args(): + parser = argparse.ArgumentParser( + description="The argument for generating the conv2d_bias_act kernels." + ) + + parser.add_argument( + "--cuda_arch", + type=str, + default=None, + help="The CUDA architecture to be generated.", + ) + args = parser.parse_args() + + return args + + +def write_kernel_to_file(kernel, file_name): + with open( + file_name, + "w", + ) as f: + f.write(kernel) + f.close() From becb078860c32cdeabf22083f322b7bc6480edb8 Mon Sep 17 00:00:00 2001 From: ming1753 <61511741+ming1753@users.noreply.github.com> Date: Thu, 29 Feb 2024 20:56:30 +0800 Subject: [PATCH 13/55] [Inference] Fix absolute paths bug in tensorrt_engine op (#62205) * fix absolute paths bug in tensorrt_engine op * fix bug * fix bug * fix bug --- .../ir_passes/tensorrt_subgraph_pass.cc | 4 +-- .../passes/save_optimized_model_pass.cc | 4 +-- .../fluid/inference/api/analysis_predictor.cc | 27 ++++++++++++++++--- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 69b27b1214839..5b2bed7745fcf 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -506,8 +506,8 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( &max_shape_tensor, &optim_shape_tensor); } else { - shape_range_info_path = - Get("model_opt_cache_dir") + "shape_range_info.pbtxt"; + shape_range_info_path = Get("model_opt_cache_dir") + "/" + + "shape_range_info.pbtxt"; if (open(shape_range_info_path.c_str(), O_RDONLY) != -1) { VLOG(1) << "trt dynamic_shape deserialize from " << shape_range_info_path; diff --git a/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc b/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc index cc463ce45f105..8d988de162100 100644 --- a/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc +++ b/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc @@ -74,7 +74,7 @@ void SaveOptimizedModelPass::SaveOptimizedModel(Argument* argument) { } } - std::string save_params_path = path + ".pdiparams"; + std::string save_params_path = path + "/" + "_optimized.pdiparams"; std::vector save_var_list(save_var_set.begin(), save_var_set.end()); std::sort(save_var_list.begin(), save_var_list.end()); @@ -105,7 +105,7 @@ void SaveOptimizedModelPass::SaveOptimizedModel(Argument* argument) { } } } - std::string save_model_path = path + ".pdmodel"; + std::string save_model_path = path + "/" + "_optimized.pdmodel"; auto str = optimized_program_desc.Proto()->SerializeAsString(); std::ofstream file(save_model_path.c_str(), std::ios::binary); file.write(str.c_str(), str.size()); // NOLINT diff --git 
a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index b61e8eaa0577d..d52f71573dc44 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -424,8 +424,10 @@ bool AnalysisPredictor::Init( // Use Optimized model to inference if (config_.use_optimized_model_) { std::string optimized_model_path = GetOptimizedModelPath(); - std::string optimized_model = optimized_model_path + ".pdmodel"; - std::string optimized_params = optimized_model_path + ".pdiparams"; + std::string optimized_model = + optimized_model_path + "/" + "_optimized.pdmodel"; + std::string optimized_params = + optimized_model_path + "/" + "_optimized.pdiparams"; if (FileExists(optimized_model) && FileExists(optimized_params)) { config_.SetModel(optimized_model, optimized_params); LOG(INFO) << "Load Optimized model from " << optimized_model_path; @@ -596,7 +598,7 @@ std::string AnalysisPredictor::GetOptimizedModelPath() { ? config_.model_dir() : inference::analysis::GetDirRoot(config_.prog_file()); } - return model_opt_cache_dir + "/" + "_optimized"; + return model_opt_cache_dir; } void AnalysisPredictor::ClearExtraParams() { @@ -608,6 +610,25 @@ void AnalysisPredictor::ClearExtraParams() { op_desc->GetAttr("parameters")); trt_repetitive_params.insert( trt_repetitive_params.end(), trt_params.begin(), trt_params.end()); + // NOTE(ming1753): This is a trick solution to the problem of possible + // absolute paths in the model_opt_cache_dir and shape_range_info_path + // attributes in tensorrt_engine op. + auto model_opt_cache_dir_from_model = PADDLE_GET_CONST( + std::string, op_desc->GetAttr("model_opt_cache_dir")); + auto model_opt_cache_dir = GetOptimizedModelPath(); + if (op_desc->HasAttr("model_opt_cache_dir")) { + op_desc->SetAttr("model_opt_cache_dir", model_opt_cache_dir); + } + if (op_desc->HasAttr("shape_range_info_path")) { + if (config_.shape_range_info_path_.empty()) { + op_desc->SetAttr( + "shape_range_info_path", + model_opt_cache_dir + "/" + "shape_range_info.pbtxt"); + } else { + op_desc->SetAttr("shape_range_info_path", + config_.shape_range_info_path_); + } + } } } From 762ae52a616764e23ea0d88b27dfa6decd57750b Mon Sep 17 00:00:00 2001 From: lzydev Date: Thu, 29 Feb 2024 21:09:28 +0800 Subject: [PATCH 14/55] fix amp pass bug (#62239) --- .../distributed/passes/auto_parallel_fp16.py | 23 ++++--------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/python/paddle/distributed/passes/auto_parallel_fp16.py b/python/paddle/distributed/passes/auto_parallel_fp16.py index 73cad3e3e928c..c1d8c54c6b4b2 100644 --- a/python/paddle/distributed/passes/auto_parallel_fp16.py +++ b/python/paddle/distributed/passes/auto_parallel_fp16.py @@ -308,25 +308,10 @@ def resolute_cast_op(self, block): if op.type == "cast": in_name = op.input('X')[0] out_name = op.output('Out')[0] - if "@GRAD" in in_name: - in_var_fw = block._find_var_recursive( - in_name[: in_name.find("@")] - ) - out_var_fw = block._find_var_recursive( - out_name[: out_name.find("@")] - ) - op._set_attr('in_dtype', in_var_fw.dtype) - op._set_attr('out_dtype', out_var_fw.dtype) - - in_var = block._find_var_recursive(in_name) - out_var = block._find_var_recursive(out_name) - in_var.desc.set_dtype(in_var_fw.dtype) - out_var.desc.set_dtype(out_var_fw.dtype) - else: - in_var = block._find_var_recursive(in_name) - out_var = block._find_var_recursive(out_name) - op._set_attr("in_dtype", in_var.dtype) - op._set_attr("out_dtype", out_var.dtype) + 
in_var = block._find_var_recursive(in_name) + out_var = block._find_var_recursive(out_name) + op._set_attr("in_dtype", in_var.dtype) + op._set_attr("out_dtype", out_var.dtype) def resolute_tensor_dtype(self, block): for op in block.ops: From 6470913f2e37ebfc17deefa3e0a61a3261ef36e7 Mon Sep 17 00:00:00 2001 From: liuzhenhai93 Date: Thu, 29 Feb 2024 21:36:02 +0800 Subject: [PATCH 15/55] =?UTF-8?q?=E3=80=90auto=20parallel=E3=80=91expand?= =?UTF-8?q?=20as=20infer=20spmd=20(#62159)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * expand as infer spmd * compile * add test * polish * polish --- paddle/phi/infermeta/spmd_rules/expand_as.cc | 86 +++++++++++++++++ paddle/phi/infermeta/spmd_rules/expand_as.h | 38 ++++++++ paddle/phi/infermeta/spmd_rules/rules.cc | 10 ++ paddle/phi/infermeta/spmd_rules/rules.h | 1 + .../auto_parallel/static/completion.py | 1 + .../static/operators/__init__.py | 1 + .../static/operators/dist_default.py | 18 ++-- .../static/operators/dist_expand_as.py | 80 ++++++++++++++++ test/cpp/auto_parallel/CMakeLists.txt | 3 + .../auto_parallel/expand_as_spmd_rule_test.cc | 95 +++++++++++++++++++ 10 files changed, 326 insertions(+), 7 deletions(-) create mode 100644 paddle/phi/infermeta/spmd_rules/expand_as.cc create mode 100644 paddle/phi/infermeta/spmd_rules/expand_as.h create mode 100644 python/paddle/distributed/auto_parallel/static/operators/dist_expand_as.py create mode 100644 test/cpp/auto_parallel/expand_as_spmd_rule_test.cc diff --git a/paddle/phi/infermeta/spmd_rules/expand_as.cc b/paddle/phi/infermeta/spmd_rules/expand_as.cc new file mode 100644 index 0000000000000..6bd663c826664 --- /dev/null +++ b/paddle/phi/infermeta/spmd_rules/expand_as.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/infermeta/spmd_rules/expand_as.h" + +#include "glog/logging.h" +#include "paddle/phi/infermeta/spmd_rules/spmd_rule_macro_define.h" +#include "paddle/phi/infermeta/spmd_rules/utils.h" + +namespace phi { +namespace distributed { + +std::tuple AlignExpandAsDistAttrs( + const DistMetaTensor& x, const DistMetaTensor& y) { + EXTRACT_SHAPE_AND_DIST_ATTR(x); + EXTRACT_SHAPE_AND_DIST_ATTR(y); + auto x_dist_attr_dst = CopyTensorDistAttrForOutput(x_dist_attr_src); + auto y_dist_attr_dst = CopyTensorDistAttrForOutput(y_dist_attr_src); + auto x_dims_mapping_dst = x_dims_mapping_src; + auto y_dims_mapping_dst = y_dims_mapping_src; + int dims_diff = y_ndim - x_ndim; + for (int i = 0; i < y_ndim; ++i) { + if (i >= dims_diff) { + if (x_shape[i - dims_diff] == y_shape[i]) { + x_dims_mapping_dst[i - dims_diff] = y_dims_mapping_src[i]; + } else { + x_dims_mapping_dst[i - dims_diff] = -1; + } + } + } + x_dist_attr_dst.set_dims_mapping(x_dims_mapping_dst); + y_dist_attr_dst.set_dims_mapping(y_dims_mapping_dst); + LOG_SPMD_INPUT(x); + LOG_SPMD_INPUT(y); + return {x_dist_attr_dst, y_dist_attr_dst}; +} + +SpmdInfo ExpandAsInferSpmd(const DistMetaTensor& x, + const DistMetaTensor& y, + const std::vector& target_shape) { + auto [x_dist_attr, y_dist_attr] = AlignExpandAsDistAttrs(x, y); + return {{x_dist_attr, y_dist_attr}, {y_dist_attr}}; +} + +SpmdInfo ExpandAsInferSpmdReverse(const DistMetaTensor& x, + const DistMetaTensor& y, + const DistMetaTensor& output, + const std::vector& target_shape) { + auto [x_dist_attr, y_dist_attr] = AlignExpandAsDistAttrs(x, output); + return {{x_dist_attr, y_dist_attr}, {y_dist_attr}}; +} + +SpmdInfo ExpandAsGradInferSpmd(const DistMetaTensor& x, + const DistMetaTensor& out_grad, + const std::vector& target_shape) { + auto [x_dist_attr, y_dist_attr] = AlignExpandAsDistAttrs(x, out_grad); + const auto& x_dims_mapping = x_dist_attr.dims_mapping(); + const auto& y_dims_mapping = y_dist_attr.dims_mapping(); + + // handle partial grad + auto x_grad_dist_attr = x_dist_attr; + int x_ndims = x_dims_mapping.size(); + int y_ndims = y_dims_mapping.size(); + int dims_diff = y_ndims - x_ndims; + std::vector partial; + for (int i = 0; i < y_ndims; ++i) { + if (i < dims_diff || x_dims_mapping[i - dims_diff] != y_dims_mapping[i]) { + if (y_dims_mapping[i] >= 0) { + partial.push_back(y_dims_mapping[i]); + } + } + } + x_grad_dist_attr.set_partial_status(partial); + return {{x_dist_attr, y_dist_attr}, {x_grad_dist_attr}}; +} + +} // namespace distributed +} // namespace phi diff --git a/paddle/phi/infermeta/spmd_rules/expand_as.h b/paddle/phi/infermeta/spmd_rules/expand_as.h new file mode 100644 index 0000000000000..67cc6f3853dc1 --- /dev/null +++ b/paddle/phi/infermeta/spmd_rules/expand_as.h @@ -0,0 +1,38 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
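The forward, reverse, and grad rules above all reduce to `AlignExpandAsDistAttrs`: `x`'s dimensions are right-aligned against `y`'s, a matching-size dimension inherits `y`'s shard, and a broadcast dimension falls back to replicated (`-1`); the grad rule additionally marks every mesh dim that shards an expanded axis as partial on `x_grad`. A minimal Python sketch of the alignment, using plain lists instead of `TensorDistAttr`:

```python
def align_expand_as(x_shape, y_shape, y_dims_mapping):
    # Right-align x's dims against y's trailing dims.
    diff = len(y_shape) - len(x_shape)
    x_dims_mapping = []
    for i in range(diff, len(y_shape)):
        if x_shape[i - diff] == y_shape[i]:
            # Same size on this axis: x can share y's shard.
            x_dims_mapping.append(y_dims_mapping[i])
        else:
            # Broadcast axis (size 1 vs. expanded): x must be replicated.
            x_dims_mapping.append(-1)
    return x_dims_mapping


# The case from the unit test below: x [1, 48] expanded as y [2, 32, 48],
# with y sharded as [0, 1, -1] on a 2x3 mesh.
print(align_expand_as([1, 48], [2, 32, 48], [0, 1, -1]))  # [-1, -1]
```

This matches the test's expectation that `x` stays fully replicated while the gradient ends up partial on mesh dims {0, 1}.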
+ +#pragma once + +#include + +#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h" +#include "paddle/phi/core/distributed/type_defs.h" + +namespace phi { +namespace distributed { +SpmdInfo ExpandAsInferSpmd(const DistMetaTensor& x, + const DistMetaTensor& y, + const std::vector& target_shape); + +SpmdInfo ExpandAsInferSpmdReverse(const DistMetaTensor& x, + const DistMetaTensor& y, + const DistMetaTensor& output, + const std::vector& target_shape); + +SpmdInfo ExpandAsGradInferSpmd(const DistMetaTensor& x, + const DistMetaTensor& out_grad, + const std::vector& target_shape); + +} // namespace distributed +} // namespace phi diff --git a/paddle/phi/infermeta/spmd_rules/rules.cc b/paddle/phi/infermeta/spmd_rules/rules.cc index aff1633ee2cba..d8ba17971b6a9 100644 --- a/paddle/phi/infermeta/spmd_rules/rules.cc +++ b/paddle/phi/infermeta/spmd_rules/rules.cc @@ -605,6 +605,16 @@ PD_REGISTER_SPMD_RULE( PD_INFER_SPMD( phi::distributed::FusedLinearParamGradAddInferSpmdFakeReverse)); +PD_REGISTER_SPMD_RULE( + expand_as, + PD_INFER_SPMD(phi::distributed::ExpandAsInferSpmd), + PD_INFER_SPMD(phi::distributed::ExpandAsInferSpmdReverse)); + +PD_REGISTER_SPMD_RULE( + expand_as_v2, + PD_INFER_SPMD(phi::distributed::ExpandAsInferSpmd), + PD_INFER_SPMD(phi::distributed::ExpandAsInferSpmdReverse)); + // scatter PD_REGISTER_SPMD_RULE(scatter, PD_INFER_SPMD(phi::distributed::ScatterInferSpmd), diff --git a/paddle/phi/infermeta/spmd_rules/rules.h b/paddle/phi/infermeta/spmd_rules/rules.h index ed6a6cbb9641c..805d20904c8a5 100644 --- a/paddle/phi/infermeta/spmd_rules/rules.h +++ b/paddle/phi/infermeta/spmd_rules/rules.h @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/phi/infermeta/spmd_rules/default_data_parallel.h" #include "paddle/phi/infermeta/spmd_rules/elementwise.h" #include "paddle/phi/infermeta/spmd_rules/embedding.h" +#include "paddle/phi/infermeta/spmd_rules/expand_as.h" #include "paddle/phi/infermeta/spmd_rules/flash_attention.h" #include "paddle/phi/infermeta/spmd_rules/flatten.h" #include "paddle/phi/infermeta/spmd_rules/full_like.h" diff --git a/python/paddle/distributed/auto_parallel/static/completion.py b/python/paddle/distributed/auto_parallel/static/completion.py index 01db8beacb7e4..663cd1afd94a4 100644 --- a/python/paddle/distributed/auto_parallel/static/completion.py +++ b/python/paddle/distributed/auto_parallel/static/completion.py @@ -181,6 +181,7 @@ def _can_apply_infer_spmd_rule(dist_op): "unsqueeze2", "silu", "concat", + "expand_as_v2", ] parallel_ce = os.getenv("PARALLEL_CROSS_ENTROPY") if parallel_ce == "true": diff --git a/python/paddle/distributed/auto_parallel/static/operators/__init__.py b/python/paddle/distributed/auto_parallel/static/operators/__init__.py index a0415fe4e6b00..93d2c2597e819 100644 --- a/python/paddle/distributed/auto_parallel/static/operators/__init__.py +++ b/python/paddle/distributed/auto_parallel/static/operators/__init__.py @@ -21,6 +21,7 @@ dist_dropout, dist_eltwise, dist_embedding, + dist_expand_as, dist_fill_constant_batch_size_like, dist_flash_attn, dist_fused_attention, diff --git a/python/paddle/distributed/auto_parallel/static/operators/dist_default.py b/python/paddle/distributed/auto_parallel/static/operators/dist_default.py index 472621c99cada..85163c57a3baa 100644 --- a/python/paddle/distributed/auto_parallel/static/operators/dist_default.py +++ b/python/paddle/distributed/auto_parallel/static/operators/dist_default.py @@ -49,6 +49,7 @@ "fill_constant_batch_size_like", "fill_constant", "expand_v2", + 
"expand_as_v2", ] @@ -534,12 +535,15 @@ def forward(ctx, *args, **kwargs): # replicate op in dist program dst_op = copy_op_without_infer_shape(src_op, main_block, ctx, kwargs) - if ( - src_op.has_attr('shape') - and src_op.attr('shape') - and src_op.type in __op_has_shape_attr__ - ): - shape_list = src_op.attr('shape') + def get_shape_attr_name(): + for name in ["shape", "target_shape"]: + if src_op.has_attr(name) and src_op.attr(name): + return name + return None + + shape_attr_name = get_shape_attr_name() + if shape_attr_name and src_op.type in __op_has_shape_attr__: + shape_list = src_op.attr(shape_attr_name) Out_var = main_block._var_recursive(kwargs['Out'][0]) op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) dim_mapping = op_dist_attr.get_output_dims_mapping(Out_var.name) @@ -552,7 +556,7 @@ def forward(ctx, *args, **kwargs): shape_list[idx] = ( shape_list[idx] // process_mesh_shape[axis] ) - dst_op.desc._set_attr('shape', shape_list) + dst_op.desc._set_attr(shape_attr_name, shape_list) # data parallel synchronization for primitive operators from paddle.incubate.autograd import prim_enabled diff --git a/python/paddle/distributed/auto_parallel/static/operators/dist_expand_as.py b/python/paddle/distributed/auto_parallel/static/operators/dist_expand_as.py new file mode 100644 index 0000000000000..db592342d6b0f --- /dev/null +++ b/python/paddle/distributed/auto_parallel/static/operators/dist_expand_as.py @@ -0,0 +1,80 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License + +from ..completion import get_phi_spmd_rule +from ..utils import get_dist_tensor_spec +from .common import ( + DistributedOperatorImplContainer, + get_default_distributed_operator_impl, + register_distributed_operator_impl_container, + update_op_dims_mapping, +) + + +class DistributedExpandAs(DistributedOperatorImplContainer): + def __init__(self, op_type): + super().__init__(op_type) + + @staticmethod + def update_dims_mapping(dist_op): + # step1: prepare inputs need for rule (order args as PHI definition and filter out unnecessary args) + op_desc = dist_op.serial_op.desc + + input_arg_names = op_desc.input_arg_names() + output_arg_names = op_desc.output_arg_names() + target_shape = op_desc.attr('target_shape') + + input_specs = [] + for name in input_arg_names: + input_specs.append(get_dist_tensor_spec(dist_op, name)) + + assert len(input_specs) == 2 + + output_spec = get_dist_tensor_spec(dist_op, output_arg_names[0], False) + + # step2: infer spmd + rule = get_phi_spmd_rule("expand_as") + # tensor order following order in PHI definition + fw_results = rule.infer_forward( + input_specs[0], input_specs[1], target_shape + ) + bw_results = rule.infer_backward( + input_specs[0], input_specs[1], output_spec, target_shape + ) + + # step3: update dist_attr + # tensor order following order in PHI definition + changed = update_op_dims_mapping( + dist_op, + input_arg_names, + output_arg_names, + fw_results, + bw_results, + ) + + return changed + + @staticmethod + def mapping_to_dist_operator_impl(dist_op, original_op_dist_attr): + op_dist_attr = dist_op.dist_attr + default_impl = get_default_distributed_operator_impl() + op_dist_attr.impl_type = default_impl.type + op_dist_attr.impl_idx = default_impl.idx + + return False + + +register_distributed_operator_impl_container( + DistributedExpandAs("expand_as_v2") +) diff --git a/test/cpp/auto_parallel/CMakeLists.txt b/test/cpp/auto_parallel/CMakeLists.txt index 2985dffa7da18..2db1baa4da642 100644 --- a/test/cpp/auto_parallel/CMakeLists.txt +++ b/test/cpp/auto_parallel/CMakeLists.txt @@ -29,6 +29,9 @@ if(WITH_DISTRIBUTE) paddle_test(cross_entropy_softmax_spmd_rule_test SRCS cross_entropy_softmax_spmd_rule_test.cc DEPS spmd_rule_test_util) + paddle_test(expand_as_spmd_rule_test SRCS expand_as_spmd_rule_test.cc DEPS + spmd_rule_test_util phi) + paddle_test(custom_op_spmd_rule_test SRCS custom_op_spmd_rule_test.cc DEPS spmd_rule_test_util phi) diff --git a/test/cpp/auto_parallel/expand_as_spmd_rule_test.cc b/test/cpp/auto_parallel/expand_as_spmd_rule_test.cc new file mode 100644 index 0000000000000..ca9daa84f99fd --- /dev/null +++ b/test/cpp/auto_parallel/expand_as_spmd_rule_test.cc @@ -0,0 +1,95 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "test/cpp/auto_parallel/spmd_rule_test_util.h" + +namespace paddle { +namespace distributed { +namespace auto_parallel { + +TEST(ExpandAsInferSpmd, Ctor) { + // Sharding along axes besides softmax axis. + std::vector x_shape = {1, 48}; + std::vector y_shape = {2, 32, 48}; + + std::vector mesh_shape = {2, 3}; + std::vector process_ids = {0, 1, 2, 3, 4, 5}; + std::vector dim_names = {"x", "y"}; + ProcessMesh process_mesh(mesh_shape, process_ids, dim_names); + + TensorDistAttr x_dist_attr = TensorDistAttr(); + x_dist_attr.set_process_mesh(process_mesh); + x_dist_attr.set_dims_mapping(std::vector({-1, -1})); + x_dist_attr.set_dynamic_dims(std::vector({false, false})); + + TensorDistAttr y_dist_attr = TensorDistAttr(); + y_dist_attr.set_process_mesh(process_mesh); + y_dist_attr.set_dims_mapping(std::vector({0, 1, -1})); + y_dist_attr.set_dynamic_dims(std::vector({false, false, false})); + + phi::distributed::DistMetaTensor x(phi::make_ddim(x_shape), x_dist_attr); + phi::distributed::DistMetaTensor y(phi::make_ddim(y_shape), y_dist_attr); + + // test info forward + auto spmdinfo = ExpandAsInferSpmd(x, y, y_shape); + EXPECT_EQ(spmdinfo.first.size(), 2UL); + EXPECT_EQ(spmdinfo.second.size(), 1UL); + + EXPECT_EQ(get_dims_mapping(spmdinfo.first[0]), + std::vector({-1, -1})); + EXPECT_EQ(get_dims_mapping(spmdinfo.first[1]), + std::vector({0, 1, -1})); + EXPECT_EQ(get_dims_mapping(spmdinfo.second[0]), + std::vector({0, 1, -1})); + EXPECT_DOUBLE_EQ( + PADDLE_GET_CONST(TensorDistAttr, spmdinfo.second[0]).is_partial(), false); + VLOG(4) << "Test ExpandAsInferSpmd" << std::endl << std::endl << std::endl; + + // test info reverse + spmdinfo = ExpandAsInferSpmdReverse(x, y, y, y_shape); + EXPECT_EQ(spmdinfo.first.size(), 2UL); + EXPECT_EQ(spmdinfo.second.size(), 1UL); + + EXPECT_EQ(get_dims_mapping(spmdinfo.first[0]), + std::vector({-1, -1})); + EXPECT_EQ(get_dims_mapping(spmdinfo.first[1]), + std::vector({0, 1, -1})); + EXPECT_EQ(get_dims_mapping(spmdinfo.second[0]), + std::vector({0, 1, -1})); + EXPECT_DOUBLE_EQ( + PADDLE_GET_CONST(TensorDistAttr, spmdinfo.second[0]).is_partial(), false); + VLOG(4) << "Test ExpandAsInferSpmdReverse" << std::endl + << std::endl + << std::endl; + + // test info grad + spmdinfo = ExpandAsGradInferSpmd(x, y, y_shape); + EXPECT_EQ(spmdinfo.first.size(), 2UL); + EXPECT_EQ(spmdinfo.second.size(), 1UL); + + EXPECT_EQ(get_dims_mapping(spmdinfo.first[0]), + std::vector({-1, -1})); + EXPECT_EQ(get_dims_mapping(spmdinfo.first[1]), + std::vector({0, 1, -1})); + EXPECT_EQ(get_dims_mapping(spmdinfo.second[0]), + std::vector({-1, -1})); + check_partial_dims(spmdinfo.second[0], {0, 1}); + VLOG(4) << "Test ExpandAsGradInferSpmd" << std::endl + << std::endl + << std::endl; +} + +} // namespace auto_parallel +} // namespace distributed +} // namespace paddle From 102c515fb5dd3743e117e64b2a62a60dcc744539 Mon Sep 17 00:00:00 2001 From: Baizhou Zhang Date: Thu, 29 Feb 2024 21:51:42 +0800 Subject: [PATCH 16/55] [Dy2St] Delete legacy class TracedLayer and its related unit tests (#62227) --- python/paddle/jit/api.py | 412 +----------------- ...imperative_trace_non_persistable_inputs.py | 101 ----- .../legacy_test/test_op_function_generator.py | 8 - test/legacy_test/test_traced_layer_err_msg.py | 272 ------------ 4 files changed, 1 insertion(+), 792 deletions(-) delete mode 100644 test/legacy_test/test_imperative_trace_non_persistable_inputs.py delete mode 100644 test/legacy_test/test_traced_layer_err_msg.py diff --git a/python/paddle/jit/api.py b/python/paddle/jit/api.py index 
fbc562d881a20..f81cb801d14bc 100644 --- a/python/paddle/jit/api.py +++ b/python/paddle/jit/api.py @@ -30,28 +30,20 @@ from paddle.base import core, dygraph from paddle.base.compiler import ( BuildStrategy, - CompiledProgram, - ExecutionStrategy, ) -from paddle.base.data_feeder import check_type from paddle.base.dygraph.base import ( - program_desc_tracing_guard, switch_to_static_graph, ) from paddle.base.executor import Executor, scope_guard from paddle.base.framework import ( - Block, EagerParamBase, Parameter, - Program, Variable, _current_expected_place, - _dygraph_guard, - _dygraph_tracer, dygraph_only, ) from paddle.base.wrapped_decorator import wrap_decorator -from paddle.framework import in_dynamic_mode, use_pir_api +from paddle.framework import use_pir_api from paddle.nn import Layer from paddle.static.io import save_inference_model from paddle.utils.environments import ( @@ -85,34 +77,6 @@ def sot_mode_guard(value: bool): yield -def create_program_from_desc(program_desc): - program = Program() - program.desc = program_desc - program.blocks = [Block(program, 0)] - program._sync_with_cpp() - return program - - -def _extract_vars(inputs, result_list, err_tag='inputs'): - if isinstance(inputs, Variable): - result_list.append(inputs) - elif isinstance(inputs, (list, tuple)): - for var in inputs: - _extract_vars(var, result_list, err_tag) - else: - raise TypeError( - "The type of 'each element of {}' in paddle.jit.api.TracedLayer.trace must be base.Variable, but received {}.".format( - err_tag, type(inputs) - ) - ) - - -def extract_vars(inputs, err_tag='inputs'): - result_list = [] - _extract_vars(inputs, result_list, err_tag) - return result_list - - def copy_decorator_attrs(original_func, decorated_obj): """ Copies some necessary attributes from original function into decorated function. @@ -1524,380 +1488,6 @@ def load(path, **configs): return TranslatedLayer._construct(model_path, config) -@dygraph_only -def _trace( - layer, inputs, feed_prefix='feed_', fetch_prefix='fetch_', tmp_prefix='t_' -): - assert isinstance(layer, Layer) - - if not isinstance(inputs, (list, tuple)): - inputs = [inputs] - - tracer = _dygraph_tracer()._get_program_desc_tracer() - - var_list = extract_vars(inputs) - - with program_desc_tracing_guard(True): - original_outputs = layer(*inputs) - if not isinstance(original_outputs, (list, tuple)): - outputs = [original_outputs] - else: - outputs = original_outputs - out_vars = extract_vars(outputs, err_tag='outputs') - - ( - program_desc, - feed_names, - fetch_names, - parameters, - ) = tracer.create_program_desc( - var_list, feed_prefix, out_vars, fetch_prefix, tmp_prefix - ) - tracer.reset() - - with _dygraph_guard(None): - program = create_program_from_desc(program_desc) - - return original_outputs, program, feed_names, fetch_names, parameters - - -class TracedLayer: - """ - :api_attr: imperative - - TracedLayer is used to convert a forward dygraph model to a static - graph model. This is mainly used to save the dygraph model for online - inference using C++. Besides, users can also do inference in Python - using the converted static graph model, which usually has better - performance than the original dygraph model. - - TracedLayer would run the static graph model using :code:`Executor` - and :code:`CompiledProgram` . The static graph model would share - parameters with the dygraph model. - - All TracedLayer objects should not be created by constructor and should - be created by static method :code:`TracedLayer.trace(layer, inputs)` . 
- - The TracedLayer can only be used to convert the data-independent dygraph - model into the static graph model, which means the dygraph model should - be independent with the tensor data and shape. - """ - - def __init__(self, program, parameters, feed_names, fetch_names): - self._program = program - self._feed_names = feed_names - self._fetch_names = fetch_names - self._params = parameters - - self._place = _current_expected_place() - - self._scope = core.Scope() - for p in parameters: - src_tensor = p.value().get_tensor() - dst_tensor = self._scope.var(p.name).get_tensor() - dst_tensor._share_data_with(src_tensor) - - self._exe = Executor(self._place) - self._compiled_program = None - self._build_strategy = None - self._exec_strategy = None - - @property - def program(self): - return self._program - - def _switch(self, is_test=True): - for block_id in range(self._program.num_blocks): - block = self._program.block(block_id) - for op in block.ops: - if op.has_attr("is_test"): - op._set_attr("is_test", is_test) - - @staticmethod - @dygraph_only - def trace(layer, inputs): - """ - This method is the only allowed method to create TracedLayer object. - It would call the :code:`layer(*inputs)` method to run the dygraph - model and convert it into a static graph model. - - Args: - layer (paddle.nn.Layer): the layer object to be traced. - inputs (list(Tensor)|tuple(Tensor)|Tensor): the input tensors of - the layer object. - - Returns: - tuple: A tuple of 2 items, whose the first item is the output of - :code:`layer(*inputs)` , and the second item is the created - TracedLayer object. - - Examples: - .. code-block:: python - - >>> import paddle - - >>> class ExampleLayer(paddle.nn.Layer): - ... def __init__(self): - ... super().__init__() - ... self._fc = paddle.nn.Linear(3, 10) - ... - ... def forward(self, input): - ... return self._fc(input) - - - >>> layer = ExampleLayer() - >>> in_var = paddle.uniform(shape=[2, 3], dtype='float32') - >>> out_dygraph, static_layer = paddle.jit.api.TracedLayer.trace(layer, inputs=[in_var]) - - >>> # run the static graph model using Executor inside - >>> out_static_graph = static_layer([in_var]) - - >>> print(len(out_static_graph)) # 1 - >>> print(out_static_graph[0].shape) # (2, 10) - - >>> # save the static graph model for inference - >>> static_layer.save_inference_model('./saved_infer_model') - - """ - assert isinstance( - layer, Layer - ), "The type of 'layer' in paddle.jit.api.TracedLayer.trace must be paddle.nn.Layer, but received {}.".format( - type(layer) - ) - outs, prog, feed, fetch, parameters = _trace(layer, inputs) - traced = TracedLayer(prog, parameters, feed, fetch) - return outs, traced - - def set_strategy(self, build_strategy=None, exec_strategy=None): - """ - Set the strategies when running static graph model. - - Args: - build_strategy (BuildStrategy, optional): build strategy of - :code:`CompiledProgram` inside TracedLayer. Default None. - exec_strategy (ExecutionStrategy, optional): execution strategy of - :code:`CompiledProgram` inside TracedLayer. Default None. - - Returns: - None - - Examples: - .. code-block:: python - - >>> import paddle - - >>> class ExampleLayer(paddle.nn.Layer): - ... def __init__(self): - ... super().__init__() - ... self._fc = paddle.nn.Linear(3, 10) - ... - ... def forward(self, input): - ... 
return self._fc(input) - - >>> layer = ExampleLayer() - >>> in_var = paddle.uniform(shape=[2, 3], dtype='float32') - - >>> out_dygraph, static_layer = paddle.jit.api.TracedLayer.trace(layer, inputs=[in_var]) - - >>> build_strategy = paddle.static.BuildStrategy() - >>> build_strategy.enable_inplace = True - - >>> exec_strategy = paddle.static.ExecutionStrategy() - >>> exec_strategy.num_threads = 2 - - >>> static_layer.set_strategy(build_strategy=build_strategy, exec_strategy=exec_strategy) - >>> out_static_graph = static_layer([in_var]) - - """ - assert self._compiled_program is None, "Cannot set strategy after run" - assert isinstance( - build_strategy, (type(None), BuildStrategy) - ), "The type of 'build_strategy' in paddle.jit.api.TracedLayer.set_strategy must be base.BuildStrategy, but received {}.".format( - type(build_strategy) - ) - assert isinstance( - exec_strategy, (type(None), ExecutionStrategy) - ), "The type of 'exec_strategy' in paddle.jit.api.TracedLayer.set_strategy must be base.ExecutionStrategy, but received {}.".format( - type(exec_strategy) - ) - self._build_strategy = build_strategy - self._exec_strategy = exec_strategy - - @switch_to_static_graph - def _compile(self): - self._compiled_program = CompiledProgram( - self._program, - build_strategy=self._build_strategy, - ) - - def _build_feed(self, inputs): - assert isinstance( - inputs, (list, tuple) - ), "Inputs should be a list or tuple of variables" - assert len(inputs) == len(self._feed_names) - feed_dict = {} - if in_dynamic_mode(): - for x, name in zip(inputs, self._feed_names): - feed_dict[name] = x.value().get_tensor() - else: - for x, name in zip(inputs, self._feed_names): - feed_dict[name] = x - - return feed_dict - - @switch_to_static_graph - def _run(self, feed): - return self._exe.run( - self._compiled_program, feed=feed, fetch_list=self._fetch_names - ) - - def __call__(self, inputs): - with scope_guard(self._scope): - if self._compiled_program is None: - self._compile() - - return self._run(self._build_feed(inputs)) - - @switch_to_static_graph - def save_inference_model(self, path, feed=None, fetch=None, **kwargs): - """ - Save the TracedLayer to a model for inference. The saved - inference model can be loaded by C++ inference APIs. - - ``path`` is the prefix of saved objects, and the saved translated program file - suffix is ``.pdmodel`` , the saved persistable variables file suffix is ``.pdiparams`` . - - Args: - path(str): The path prefix to save model. The format is ``dirname/file_prefix`` or ``file_prefix``. - feed (list[int], optional): the input variable indices of the saved - inference model. If None, all input variables of the - TracedLayer object would be the inputs of the saved inference - model. Default None. - fetch (list[int], optional): the output variable indices of the - saved inference model. If None, all output variables of the - TracedLayer object would be the outputs of the saved inference - model. Default None. - kwargs: Supported keys including - - clip_extra(bool): whether to clip extra information for every operator. Defaults to True. - - legacy_format(bool): whether to save program in legacy format. Default to False. - - Returns: - None - - Examples: - .. code-block:: python - - >>> import numpy as np - >>> import paddle - - >>> class ExampleLayer(paddle.nn.Layer): - ... def __init__(self): - ... super().__init__() - ... self._fc = paddle.nn.Linear(3, 10) - ... - ... def forward(self, input): - ... 
return self._fc(input) - - >>> save_dirname = './saved_infer_model' - >>> in_np = np.random.random([2, 3]).astype('float32') - >>> in_var = paddle.to_tensor(in_np) - >>> layer = ExampleLayer() - - >>> out_dygraph, static_layer = paddle.jit.api.TracedLayer.trace(layer, inputs=[in_var]) - >>> static_layer.save_inference_model(save_dirname, feed=[0], fetch=[0]) - - >>> paddle.enable_static() - >>> place = paddle.CPUPlace() - >>> exe = paddle.static.Executor(place) - >>> program, feed_vars, fetch_vars = paddle.static.load_inference_model( - ... save_dirname, - ... exe - ... ) - - >>> fetch, = exe.run(program, feed={feed_vars[0]: in_np}, fetch_list=fetch_vars) - >>> print(fetch.shape) - [2, 10] - """ - check_type( - path, - "path", - str, - "paddle.jit.api.TracedLayer.save_inference_model", - ) - check_type( - feed, - "feed", - (type(None), list), - "paddle.jit.api.TracedLayer.save_inference_model", - ) - if isinstance(feed, list): - for f in feed: - check_type( - f, - "each element of feed", - int, - "paddle.jit.api.TracedLayer.save_inference_model", - ) - check_type( - fetch, - "fetch", - (type(None), list), - "paddle.jit.api.TracedLayer.save_inference_model", - ) - if isinstance(fetch, list): - for f in fetch: - check_type( - f, - "each element of fetch", - int, - "paddle.jit.api.TracedLayer.save_inference_model", - ) - clip_extra = kwargs.get('clip_extra', True) - # path check - file_prefix = os.path.basename(path) - if file_prefix == "": - raise ValueError( - "The input path MUST be format of dirname/file_prefix " - "[dirname\\file_prefix in Windows system], but received " - "file_prefix is empty string." - ) - - dirname = os.path.dirname(path) - if dirname and not os.path.exists(dirname): - os.makedirs(dirname) - - def get_feed_fetch(all_vars, partial_vars): - if partial_vars is None: - return all_vars - - return [all_vars[idx] for idx in partial_vars] - - with scope_guard(self._scope): - feeded_var_names = get_feed_fetch(self._feed_names, feed) - target_var_names = get_feed_fetch(self._fetch_names, fetch) - feed_vars = [] - for name in feeded_var_names: - feed_var = self._program.global_block().vars.get(name, None) - assert feed_var is not None, f"{name} cannot be found" - feed_vars.append(feed_var) - target_vars = [] - for name in target_var_names: - target_var = self._program.global_block().vars.get(name, None) - assert target_var is not None, f"{name} cannot be found" - target_vars.append(target_var) - legacy_format = kwargs.get('legacy_format', False) - file_prefix = os.path.join(dirname, file_prefix) - save_inference_model( - path_prefix=file_prefix, - feed_vars=feed_vars, - fetch_vars=target_vars, - executor=self._exe, - program=self._program.clone(), - clip_extra=clip_extra, - legacy_format=legacy_format, - ) - - def set_dynamic_shape(variable, shape_list): if paddle.base.dygraph.base.in_to_static_mode(): if isinstance(variable, paddle.base.framework.Variable): diff --git a/test/legacy_test/test_imperative_trace_non_persistable_inputs.py b/test/legacy_test/test_imperative_trace_non_persistable_inputs.py deleted file mode 100644 index 5238e37df5a5a..0000000000000 --- a/test/legacy_test/test_imperative_trace_non_persistable_inputs.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import unittest - -import numpy as np - -import paddle -from paddle import base - - -class SimpleFCLayer(paddle.nn.Layer): - def __init__(self, feature_size, batch_size, fc_size): - super().__init__() - self._linear = paddle.nn.Linear(feature_size, fc_size) - self._offset = paddle.to_tensor( - np.random.random((batch_size, fc_size)).astype('float32') - ) - - def forward(self, x): - fc = self._linear(x) - return fc + self._offset - - -class TestTracedLayerRecordNonPersistableInput(unittest.TestCase): - def test_main(self): - if base.framework.in_dygraph_mode(): - return - traced_layer = None - with base.dygraph.guard(): - feature_size = 3 - batch_size = 4 - fc_size = 2 - layer = SimpleFCLayer(feature_size, batch_size, fc_size) - optimizer = paddle.optimizer.SGD( - learning_rate=1e-3, parameters=layer.parameters() - ) - - expected_persistable_vars = { - layer._linear.weight.name, - layer._linear.bias.name, - layer._offset.name, - } - - for _ in range(10): - in_x = paddle.to_tensor( - np.random.random((batch_size, feature_size)).astype( - 'float32' - ) - ) - if traced_layer is None: - dygraph_out, traced_layer = base.dygraph.TracedLayer.trace( - layer, [in_x] - ) - else: - dygraph_out = layer(in_x) - dygraph_out_numpy = dygraph_out.numpy() - static_out = traced_layer([in_x])[0] - np.testing.assert_array_equal(dygraph_out_numpy, static_out) - - loss = paddle.mean(dygraph_out) - loss.backward() - - optimizer.minimize(loss) - - del layer - - program = traced_layer.program - actual_persistable_vars = set() - for var in program.list_vars(): - if var.persistable: - actual_persistable_vars.add(var.name) - - self.assertEqual(actual_persistable_vars, expected_persistable_vars) - - traced_layer.save_inference_model( - path='./traced_layer_test_non_persistable_vars' - ) - self.assertTrue( - 'traced_layer_test_non_persistable_vars.pdmodel' in os.listdir('./') - ) - self.assertTrue( - 'traced_layer_test_non_persistable_vars.pdiparams' - in os.listdir('./') - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_op_function_generator.py b/test/legacy_test/test_op_function_generator.py index c37dd56c6a98a..d34d0aff45edd 100644 --- a/test/legacy_test/test_op_function_generator.py +++ b/test/legacy_test/test_op_function_generator.py @@ -21,14 +21,6 @@ from paddle import _legacy_C_ops, base -class TestTracedLayer(paddle.nn.Layer): - def __init__(self, name_scope): - super().__init__(name_scope) - - def forward(self, input): - return _legacy_C_ops.relu(input) - - class TestVariable(unittest.TestCase): def setUp(self): self.shape = [512, 768] diff --git a/test/legacy_test/test_traced_layer_err_msg.py b/test/legacy_test/test_traced_layer_err_msg.py deleted file mode 100644 index 4927fdea82a54..0000000000000 --- a/test/legacy_test/test_traced_layer_err_msg.py +++ /dev/null @@ -1,272 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile -import unittest - -import numpy as np - -import paddle -from paddle import base, nn - - -class SimpleFCLayer(nn.Layer): - def __init__(self, feature_size, batch_size, fc_size): - super().__init__() - self._linear = nn.Linear(feature_size, fc_size) - self._offset = paddle.to_tensor( - np.random.random((batch_size, fc_size)).astype('float32') - ) - - def forward(self, x): - fc = self._linear(x) - return fc + self._offset - - -class LinearNetWithNone(nn.Layer): - def __init__(self, feature_size, fc_size): - super().__init__() - self._linear = nn.Linear(feature_size, fc_size) - - def forward(self, x): - fc = self._linear(x) - - return [fc, [None, 2]] - - -class TestTracedLayerErrMsg(unittest.TestCase): - def setUp(self): - self.batch_size = 4 - self.feature_size = 3 - self.fc_size = 2 - self.layer = self._train_simple_net() - self.type_str = 'class' - self.temp_dir = tempfile.TemporaryDirectory() - - def tearDown(self): - self.temp_dir.cleanup() - - def test_trace_err(self): - if base.framework.in_dygraph_mode(): - return - with base.dygraph.guard(): - in_x = paddle.to_tensor( - np.random.random((self.batch_size, self.feature_size)).astype( - 'float32' - ) - ) - - with self.assertRaises(AssertionError) as e: - dygraph_out, traced_layer = base.dygraph.TracedLayer.trace( - None, [in_x] - ) - self.assertEqual( - "The type of 'layer' in paddle.jit.TracedLayer.trace must be paddle.nn.Layer, but received <{} 'NoneType'>.".format( - self.type_str - ), - str(e.exception), - ) - with self.assertRaises(TypeError) as e: - dygraph_out, traced_layer = base.dygraph.TracedLayer.trace( - self.layer, 3 - ) - self.assertEqual( - "The type of 'each element of inputs' in paddle.jit.TracedLayer.trace must be base.Variable, but received <{} 'int'>.".format( - self.type_str - ), - str(e.exception), - ) - with self.assertRaises(TypeError) as e: - dygraph_out, traced_layer = base.dygraph.TracedLayer.trace( - self.layer, [True, 1] - ) - self.assertEqual( - "The type of 'each element of inputs' in paddle.jit.TracedLayer.trace must be base.Variable, but received <{} 'bool'>.".format( - self.type_str - ), - str(e.exception), - ) - - dygraph_out, traced_layer = base.dygraph.TracedLayer.trace( - self.layer, [in_x] - ) - - def test_set_strategy_err(self): - if base.framework.in_dygraph_mode(): - return - with base.dygraph.guard(): - in_x = paddle.to_tensor( - np.random.random((self.batch_size, self.feature_size)).astype( - 'float32' - ) - ) - dygraph_out, traced_layer = base.dygraph.TracedLayer.trace( - self.layer, [in_x] - ) - - with self.assertRaises(AssertionError) as e: - traced_layer.set_strategy(1, base.ExecutionStrategy()) - self.assertEqual( - "The type of 'build_strategy' in paddle.jit.TracedLayer.set_strategy must be base.BuildStrategy, but received <{} 'int'>.".format( - self.type_str - ), - str(e.exception), - ) - - with self.assertRaises(AssertionError) as e: - traced_layer.set_strategy(base.BuildStrategy(), False) - self.assertEqual( - "The type of 'exec_strategy' in paddle.jit.TracedLayer.set_strategy must be base.ExecutionStrategy, but received <{} 'bool'>.".format( - 
self.type_str - ), - str(e.exception), - ) - - traced_layer.set_strategy(build_strategy=base.BuildStrategy()) - traced_layer.set_strategy(exec_strategy=base.ExecutionStrategy()) - traced_layer.set_strategy( - base.BuildStrategy(), base.ExecutionStrategy() - ) - - def test_save_inference_model_err(self): - if base.framework.in_dygraph_mode(): - return - with base.dygraph.guard(): - in_x = paddle.to_tensor( - np.random.random((self.batch_size, self.feature_size)).astype( - 'float32' - ) - ) - dygraph_out, traced_layer = base.dygraph.TracedLayer.trace( - self.layer, [in_x] - ) - - path = os.path.join(self.temp_dir.name, './traced_layer_err_msg') - with self.assertRaises(TypeError) as e: - traced_layer.save_inference_model([0]) - self.assertEqual( - "The type of 'path' in paddle.jit.TracedLayer.save_inference_model must be <{} 'str'>, but received <{} 'list'>. ".format( - self.type_str, self.type_str - ), - str(e.exception), - ) - with self.assertRaises(TypeError) as e: - traced_layer.save_inference_model(path, [0], [None]) - self.assertEqual( - "The type of 'each element of fetch' in paddle.jit.TracedLayer.save_inference_model must be <{} 'int'>, but received <{} 'NoneType'>. ".format( - self.type_str, self.type_str - ), - str(e.exception), - ) - with self.assertRaises(TypeError) as e: - traced_layer.save_inference_model(path, [0], False) - self.assertEqual( - "The type of 'fetch' in paddle.jit.TracedLayer.save_inference_model must be (<{} 'NoneType'>, <{} 'list'>), but received <{} 'bool'>. ".format( - self.type_str, self.type_str, self.type_str - ), - str(e.exception), - ) - with self.assertRaises(TypeError) as e: - traced_layer.save_inference_model(path, [None], [0]) - self.assertEqual( - "The type of 'each element of feed' in paddle.jit.TracedLayer.save_inference_model must be <{} 'int'>, but received <{} 'NoneType'>. ".format( - self.type_str, self.type_str - ), - str(e.exception), - ) - with self.assertRaises(TypeError) as e: - traced_layer.save_inference_model(path, True, [0]) - self.assertEqual( - "The type of 'feed' in paddle.jit.TracedLayer.save_inference_model must be (<{} 'NoneType'>, <{} 'list'>), but received <{} 'bool'>. 
".format( - self.type_str, self.type_str, self.type_str - ), - str(e.exception), - ) - with self.assertRaises(ValueError) as e: - traced_layer.save_inference_model("") - self.assertEqual( - "The input path MUST be format of dirname/file_prefix [dirname\\file_prefix in Windows system], " - "but received file_prefix is empty string.", - str(e.exception), - ) - - traced_layer.save_inference_model(path) - - def _train_simple_net(self): - layer = None - with base.dygraph.guard(): - layer = SimpleFCLayer( - self.feature_size, self.batch_size, self.fc_size - ) - optimizer = paddle.optimizer.SGD( - learning_rate=1e-3, parameters=layer.parameters() - ) - - for i in range(5): - in_x = paddle.to_tensor( - np.random.random( - (self.batch_size, self.feature_size) - ).astype('float32') - ) - dygraph_out = layer(in_x) - loss = paddle.mean(dygraph_out) - loss.backward() - optimizer.minimize(loss) - return layer - - -class TestOutVarWithNoneErrMsg(unittest.TestCase): - def test_linear_net_with_none(self): - if base.framework.in_dygraph_mode(): - return - model = LinearNetWithNone(100, 16) - in_x = paddle.to_tensor(np.random.random((4, 100)).astype('float32')) - with self.assertRaises(TypeError): - dygraph_out, traced_layer = base.dygraph.TracedLayer.trace( - model, [in_x] - ) - - -class TestTracedLayerSaveInferenceModel(unittest.TestCase): - """test save_inference_model will automatically create non-exist dir""" - - def setUp(self): - self.temp_dir = tempfile.TemporaryDirectory() - self.save_path = os.path.join(self.temp_dir.name, "./nonexist_dir/fc") - import shutil - - if os.path.exists(os.path.dirname(self.save_path)): - shutil.rmtree(os.path.dirname(self.save_path)) - - def tearDown(self): - self.temp_dir.cleanup() - - def test_mkdir_when_input_path_non_exist(self): - if base.framework.in_dygraph_mode(): - return - fc_layer = SimpleFCLayer(3, 4, 2) - input_var = paddle.to_tensor(np.random.random([4, 3]).astype('float32')) - with base.dygraph.guard(): - dygraph_out, traced_layer = base.dygraph.TracedLayer.trace( - fc_layer, inputs=[input_var] - ) - self.assertFalse(os.path.exists(os.path.dirname(self.save_path))) - traced_layer.save_inference_model(self.save_path) - self.assertTrue(os.path.exists(os.path.dirname(self.save_path))) - - -if __name__ == '__main__': - unittest.main() From c6be4727b1747f204455b919a77ac3ac9e8ec880 Mon Sep 17 00:00:00 2001 From: Yuanle Liu Date: Thu, 29 Feb 2024 22:44:16 +0800 Subject: [PATCH 17/55] [PIR] Fix dce pass for not eliminated completely (#62242) --- paddle/fluid/pir/transforms/dead_code_elimination_pass.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/pir/transforms/dead_code_elimination_pass.cc b/paddle/fluid/pir/transforms/dead_code_elimination_pass.cc index 442aec918e08f..d802a470e86f1 100644 --- a/paddle/fluid/pir/transforms/dead_code_elimination_pass.cc +++ b/paddle/fluid/pir/transforms/dead_code_elimination_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/pir/transforms/dead_code_elimination_pass.h" +#include #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" @@ -31,7 +32,12 @@ class DeadCodeEliminationPass : public pir::Pass { void Run(pir::Operation* op) override { VLOG(6) << "apply dead_code_elimination_pass"; int64_t num_erasers{0}; - EraseOp(*op->GetParentProgram()->block(), &num_erasers); + bool updated{true}; + while (updated) { + int64_t pre_num_erasers = num_erasers; + EraseOp(*op->GetParentProgram()->block(), &num_erasers); + updated = pre_num_erasers != num_erasers; + } AddStatistics(num_erasers); } From 4e0779cbfc025e0b46068e291bbcee42371dd771 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 29 Feb 2024 23:24:07 +0800 Subject: [PATCH 18/55] Fix CPUAPlace CPUPlace, etc (#62214) --- paddle/fluid/platform/collective_helper.cc | 4 ++-- paddle/fluid/platform/device_event_base.cc | 6 ++--- paddle/fluid/platform/device_event_cpu.h | 2 +- paddle/fluid/platform/device_event_test.cc | 4 ++-- .../platform/profiler/chrometracing_logger.cc | 2 +- .../platform/profiler/chrometracing_logger.h | 2 +- .../profiler/dump/deserialization_reader.cc | 12 +++++----- .../profiler/dump/serialization_logger.h | 2 +- .../fluid/platform/profiler/event_tracing.h | 2 +- paddle/fluid/platform/profiler/profiler.cc | 24 +++++++++---------- paddle/fluid/platform/profiler/utils.cc | 8 +++---- paddle/fluid/platform/profiler_helper.h | 2 +- 12 files changed, 35 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index 4ffcf53b1a574..3444f71639b46 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -183,7 +183,7 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( VLOG(1) << "ncclCommInitRank: " << i; } PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupEnd()); - VLOG(1) << "nccl group end seccessss"; + VLOG(1) << "nccl group end success"; } PADDLE_ENFORCE_EQ(comm_map_.count(ring_id), 0, @@ -261,7 +261,7 @@ NCCLComm* NCCLCommContext::AssignNCCLComm( platform::CUDAPlace(dev_id))); dev_ctx->set_nccl_comm(comm); } - VLOG(4) << "add mccl comm: " << comm_map_[ring_id][dev_id].get() + VLOG(4) << "add nccl comm: " << comm_map_[ring_id][dev_id].get() << ", ring_id:" << ring_id << ", dev_id:" << dev_id; return comm_map_[ring_id][dev_id].get(); } diff --git a/paddle/fluid/platform/device_event_base.cc b/paddle/fluid/platform/device_event_base.cc index cd2d31f1fbefb..6079691fe873c 100644 --- a/paddle/fluid/platform/device_event_base.cc +++ b/paddle/fluid/platform/device_event_base.cc @@ -66,9 +66,9 @@ void DeviceEventRecordCPU(DeviceEvent* event, const DeviceContext* context) { auto* wrapper = static_cast(event->GetEvent().get()); std::unique_lock lock(wrapper->mutex_); - // NOTE: As for CudaEvent_t, it can be used to Record() repeatly. CudaEvent_t - // internally reset its status from finished into initialized. - // So we simulate the process here. + // NOTE: As for CudaEvent_t, it can be used to Record() repeatedly. + // CudaEvent_t internally reset its status from finished into initialized. So + // we simulate the process here. if (wrapper->status_.load() == EventStatus::SUCCESS) { VLOG(3) << "Found EventStatus is SUCCESS before RecordCPU. 
Reset it into " "INITIALIZED."; diff --git a/paddle/fluid/platform/device_event_cpu.h b/paddle/fluid/platform/device_event_cpu.h index 9490d5f3ceec8..e6faeb5fd01a4 100644 --- a/paddle/fluid/platform/device_event_cpu.h +++ b/paddle/fluid/platform/device_event_cpu.h @@ -30,7 +30,7 @@ struct CPUDeviceEventWrapper { platform::is_cpu_place(place), true, platform::errors::PreconditionNotMet( - "Required device shall be CPUAPlace, but received %d. ", place)); + "Required device shall be CPUPlace, but received %d. ", place)); } std::mutex mutex_; std::condition_variable cv_completed_; diff --git a/paddle/fluid/platform/device_event_test.cc b/paddle/fluid/platform/device_event_test.cc index b2e3d3242d219..4eb0da7740f3a 100644 --- a/paddle/fluid/platform/device_event_test.cc +++ b/paddle/fluid/platform/device_event_test.cc @@ -63,7 +63,7 @@ TEST(DeviceEvent, CUDA) { status = event.Query(); ASSERT_EQ(status, false); // async - event.Wait(kCPU, context); // step 3. EventSynchornize + event.Wait(kCPU, context); // step 3. EventSynchronize status = event.Query(); ASSERT_EQ(status, true); // sync @@ -114,7 +114,7 @@ TEST(DeviceEvent, CUDA) { status = event.Query(); ASSERT_EQ(status, false); // async - event.Wait(kCPU, context); // step 3. EventSynchornize + event.Wait(kCPU, context); // step 3. EventSynchronize status = event.Query(); ASSERT_EQ(status, true); // sync diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index de8fd01a1e59d..87fbe61979876 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -788,7 +788,7 @@ void ChromeTracingLogger::RefineDisplayName( "name": "process_name", "pid": %lld, "tid": %lld, "ph": "M", "args": { - "name": "Deivce %lld (%s)" + "name": "Device %lld (%s)" } }, { diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.h b/paddle/fluid/platform/profiler/chrometracing_logger.h index 37323d1450bf2..89808bee842df 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.h +++ b/paddle/fluid/platform/profiler/chrometracing_logger.h @@ -57,7 +57,7 @@ class ChromeTracingLogger : public BaseLogger { void RefineDisplayName(std::unordered_map); std::string filename_; std::ofstream output_file_stream_; - static const char* categary_name_[]; + static const char* category_name_[]; std::set> pid_tid_set_; std::set> deviceid_streamid_set_; uint64_t start_time_; diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc index 329c9f6871461..f02496ed5d082 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc @@ -44,12 +44,12 @@ std::unique_ptr DeserializationReader::Parse() { return nullptr; } // restore extra info - ExtraInfo extrainfo; + ExtraInfo extra_info; for (auto indx = 0; indx < node_trees_proto_->extra_info_size(); indx++) { ExtraInfoMap extra_info_map = node_trees_proto_->extra_info(indx); - extrainfo.AddExtraInfo(extra_info_map.key(), - std::string("%s"), - extra_info_map.value().c_str()); + extra_info.AddExtraInfo(extra_info_map.key(), + std::string("%s"), + extra_info_map.value().c_str()); } // restore NodeTrees @@ -139,10 +139,10 @@ std::unique_ptr DeserializationReader::Parse() { RestoreDeviceProperty(device_property_proto); } ProfilerResult* profiler_result_ptr = - new ProfilerResult(std::move(tree), extrainfo, device_property_map); + new 
ProfilerResult(std::move(tree), extra_info, device_property_map); #else ProfilerResult* profiler_result_ptr = - new ProfilerResult(std::move(tree), extrainfo); + new ProfilerResult(std::move(tree), extra_info); #endif // restore version and span indx profiler_result_ptr->SetVersion(node_trees_proto_->version()); diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.h b/paddle/fluid/platform/profiler/dump/serialization_logger.h index 80d5413106ded..e61ed701cd798 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.h +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace paddle { namespace platform { -// Dump a NodeTrees into a profobuf file. +// Dump a NodeTrees into a protobuf file. // A SerializationLogger object can only dump a NodeTrees object, // creates a file in the constructor and closes the file in the destructor. // Should only call LogNodeTrees and LogMetaInfo. diff --git a/paddle/fluid/platform/profiler/event_tracing.h b/paddle/fluid/platform/profiler/event_tracing.h index 08890f1369733..b427a9ba55210 100644 --- a/paddle/fluid/platform/profiler/event_tracing.h +++ b/paddle/fluid/platform/profiler/event_tracing.h @@ -28,7 +28,7 @@ namespace platform { // Chrome Trace Viewer Format: Instant Event struct RecordInstantEvent { /** - * @param name: It is the caller's reponsibility to manage the underlying + * @param name: It is the caller's responsibility to manage the underlying * storage. RecordInstantEvent stores the pointer. * @param type: Classification which is used to instruct the profiling * data statistics. diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index bcb35f5b7bd35..c9d458b1d250a 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -148,19 +148,19 @@ std::unique_ptr Profiler::Stop() { collector.MemEvents(), collector.OperatorSupplementEvents())); cpu_utilization_.RecordEndTimeInfo(); - ExtraInfo extrainfo; - extrainfo.AddExtraInfo(std::string("System Cpu Utilization"), - std::string("%f"), - cpu_utilization_.GetCpuUtilization()); - extrainfo.AddExtraInfo(std::string("Process Cpu Utilization"), - std::string("%f"), - cpu_utilization_.GetCpuCurProcessUtilization()); + ExtraInfo extra_info; + extra_info.AddExtraInfo(std::string("System Cpu Utilization"), + std::string("%f"), + cpu_utilization_.GetCpuUtilization()); + extra_info.AddExtraInfo(std::string("Process Cpu Utilization"), + std::string("%f"), + cpu_utilization_.GetCpuCurProcessUtilization()); const std::unordered_map thread_names = collector.ThreadNames(); for (const auto& kv : thread_names) { - extrainfo.AddExtraInfo(string_format(std::string("%llu"), kv.first), - std::string("%s"), - kv.second.c_str()); + extra_info.AddExtraInfo(string_format(std::string("%llu"), kv.first), + std::string("%s"), + kv.second.c_str()); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::map device_property_map; @@ -170,10 +170,10 @@ std::unique_ptr Profiler::Stop() { device_property_map[device_id] = device_property; } ProfilerResult* profiler_result_ptr = new platform::ProfilerResult( - std::move(tree), extrainfo, device_property_map); + std::move(tree), extra_info, device_property_map); #else ProfilerResult* profiler_result_ptr = - new platform::ProfilerResult(std::move(tree), extrainfo); + new platform::ProfilerResult(std::move(tree), extra_info); #endif 
profiler_result_ptr->SetVersion(std::string(version)); profiler_result_ptr->SetSpanIndx(span_indx); diff --git a/paddle/fluid/platform/profiler/utils.cc b/paddle/fluid/platform/profiler/utils.cc index 46a94e7fcb23c..8c12f84416579 100644 --- a/paddle/fluid/platform/profiler/utils.cc +++ b/paddle/fluid/platform/profiler/utils.cc @@ -145,16 +145,16 @@ float CalculateEstOccupancy(uint32_t DeviceId, #endif // PADDLE_WITH_CUPTI const char* StringTracerMemEventType(TracerMemEventType type) { - static const char* categary_name_[] = {// NOLINT + static const char* category_name_[] = {// NOLINT "Allocate", "Free", "ReservedAllocate", "ReservedFree"}; - return categary_name_[static_cast(type)]; + return category_name_[static_cast(type)]; } const char* StringTracerEventType(TracerEventType type) { - static const char* categary_name_[] = {"Operator", // NOLINT + static const char* category_name_[] = {"Operator", // NOLINT "Dataloader", "ProfileStep", "CudaRuntime", @@ -169,7 +169,7 @@ const char* StringTracerEventType(TracerEventType type) { "Communication", "PythonOp", "PythonUserDefined"}; - return categary_name_[static_cast(type)]; + return category_name_[static_cast(type)]; } } // namespace platform diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 8ce6fee8a5f6e..f79b801f1a095 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -740,7 +740,7 @@ void AnalyzeEvent( size_t *max_name_width, OverHead *overhead, bool merge_thread) { - // In oreder to deal with special event in main thread + // In order to deal with special event in main thread std::set main_thread_event_name; for (size_t i = 0; i < (*analyze_events).size(); i++) { for (size_t j = 0; j < (*analyze_events)[i].size(); j++) { From 7921a77a83c51b14fa3ca2a123fcb02b77fce683 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 29 Feb 2024 23:25:09 +0800 Subject: [PATCH 19/55] Fix precison_mode precision_mode, etc (#62212) --- .../transforms/auto_mixed_precision_pass.cc | 4 +-- .../fusion/conv2d_add_act_fuse_pass.cc | 4 +-- .../fused_linear_param_grad_add_pass.cc | 28 +++++++++---------- .../fusion/fused_weight_only_linear_pass.cc | 6 ++-- .../pir/transforms/sub_graph_detector.cc | 10 +++---- .../fluid/pir/transforms/sub_graph_detector.h | 2 +- 6 files changed, 27 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/pir/transforms/auto_mixed_precision_pass.cc b/paddle/fluid/pir/transforms/auto_mixed_precision_pass.cc index 4f5c4c0e4cd6b..dee9aad09ed1d 100644 --- a/paddle/fluid/pir/transforms/auto_mixed_precision_pass.cc +++ b/paddle/fluid/pir/transforms/auto_mixed_precision_pass.cc @@ -66,7 +66,7 @@ class AutoMixedPrecisionPass : public pir::Pass { "Use Set method to set the place attribute."); IR_ENFORCE(Has("__mixed_precision_mode__"), "Pass initialize failed." - "When using AutoMixedPrecisionPass, precison_mode attribute is " + "When using AutoMixedPrecisionPass, precision_mode attribute is " "required!" 
"Use Set method to set the scope attribute."); @@ -224,7 +224,7 @@ class AutoMixedPrecisionPass : public pir::Pass { precision_updated = true; } if (!OpRunLowPrecision(op)) continue; - // if the producer's output is in float VectorType, then the precsion + // if the producer's output is in float VectorType, then the precision // between two op should be the same for (size_t idx = 0; idx < op->num_operands(); ++idx) { if (!op->operand_source(idx)) continue; diff --git a/paddle/fluid/pir/transforms/fusion/conv2d_add_act_fuse_pass.cc b/paddle/fluid/pir/transforms/fusion/conv2d_add_act_fuse_pass.cc index 9e950dc2d11b9..4968ae9744248 100644 --- a/paddle/fluid/pir/transforms/fusion/conv2d_add_act_fuse_pass.cc +++ b/paddle/fluid/pir/transforms/fusion/conv2d_add_act_fuse_pass.cc @@ -207,7 +207,7 @@ class Conv2dAddActFusePass : public pir::PatternRewritePass { 1, std::vector{ paddle::dialect::FusedConv2dAddActOp::name()}); - auto conv2d_doublue_add_act_fuse_pattern = + auto conv2d_double_add_act_fuse_pattern = std::make_unique( context, 1, @@ -215,7 +215,7 @@ class Conv2dAddActFusePass : public pir::PatternRewritePass { paddle::dialect::FusedConv2dAddActOp::name()}); // conv2d+add+add+act->fused_conv2d_add_act - ps.Add(std::move(conv2d_doublue_add_act_fuse_pattern)); + ps.Add(std::move(conv2d_double_add_act_fuse_pattern)); // conv2d+add+act->fused_conv2d_add_act ps.Add(std::move(conv2d_add_act_fuse_pattern)); return ps; diff --git a/paddle/fluid/pir/transforms/fusion/fused_linear_param_grad_add_pass.cc b/paddle/fluid/pir/transforms/fusion/fused_linear_param_grad_add_pass.cc index 120b882a67194..074d2d1acb009 100644 --- a/paddle/fluid/pir/transforms/fusion/fused_linear_param_grad_add_pass.cc +++ b/paddle/fluid/pir/transforms/fusion/fused_linear_param_grad_add_pass.cc @@ -67,7 +67,7 @@ class FusedMatmulAddGradAddPattern : public paddle::drr::DrrPatternBase { }); paddle::drr::ResultPattern res = pat.ResultPattern(); - const auto &muti_precision_attr = + const auto &multi_precision_attr = res.ComputeAttr([](const paddle::drr::MatchContext &match_ctx) -> bool { return !(pir::GetDataTypeFromValue(match_ctx.Tensor("dweight")) == pir::GetDataTypeFromValue(match_ctx.Tensor("weight_grad"))); @@ -78,7 +78,7 @@ class FusedMatmulAddGradAddPattern : public paddle::drr::DrrPatternBase { {"transpose_y", res.BoolAttr(true)}}); const auto &fused_linear_param_grad_add = res.Op(paddle::dialect::FusedLinearParamGradAddOp::name(), - {{{"multi_precision", muti_precision_attr}, + {{{"multi_precision", multi_precision_attr}, {"has_bias", res.BoolAttr(true)}}}); matmul({&res.Tensor("fwd_add_out_grad"), &res.Tensor("weight")}, @@ -122,7 +122,7 @@ class FusedMatmulGradAddPattern : public paddle::drr::DrrPatternBase { paddle::drr::ResultPattern res = pat.ResultPattern(); - const auto &muti_precision_attr = + const auto &multi_precision_attr = res.ComputeAttr([](const paddle::drr::MatchContext &match_ctx) -> bool { return !(pir::GetDataTypeFromValue(match_ctx.Tensor("dweight")) == pir::GetDataTypeFromValue(match_ctx.Tensor("weight_grad"))); @@ -133,7 +133,7 @@ class FusedMatmulGradAddPattern : public paddle::drr::DrrPatternBase { {"transpose_y", res.BoolAttr(true)}}); const auto &fused_linear_param_grad_add = res.Op(paddle::dialect::FusedLinearParamGradAddOp::name(), - {{{"multi_precision", muti_precision_attr}, + {{{"multi_precision", multi_precision_attr}, {"has_bias", res.BoolAttr(false)}}}); matmul({&res.Tensor("out_grad"), &res.Tensor("weight")}, @@ -194,7 +194,7 @@ class FusedMatmulReshapeMatmulAddPattern : public 
paddle::drr::DrrPatternBase { }); paddle::drr::ResultPattern res = pat.ResultPattern(); - const auto &muti_precision_attr = + const auto &multi_precision_attr = res.ComputeAttr([](const paddle::drr::MatchContext &match_ctx) -> bool { return !(pir::GetDataTypeFromValue(match_ctx.Tensor("dweight")) == pir::GetDataTypeFromValue(match_ctx.Tensor("w_grad"))); @@ -202,7 +202,7 @@ class FusedMatmulReshapeMatmulAddPattern : public paddle::drr::DrrPatternBase { const auto &fused_linear_param_grad_add = res.Op(paddle::dialect::FusedLinearParamGradAddOp::name(), - {{{"multi_precision", muti_precision_attr}, + {{{"multi_precision", multi_precision_attr}, {"has_bias", res.BoolAttr(false)}}}); fused_linear_param_grad_add( @@ -239,7 +239,7 @@ class FusedMatmulAddaPattern : public paddle::drr::DrrPatternBase { }); paddle::drr::ResultPattern res = pat.ResultPattern(); - const auto &muti_precision_attr = + const auto &multi_precision_attr = res.ComputeAttr([](const paddle::drr::MatchContext &match_ctx) -> bool { return !(pir::GetDataTypeFromValue(match_ctx.Tensor("dweight")) == pir::GetDataTypeFromValue(match_ctx.Tensor("weight_grad"))); @@ -247,7 +247,7 @@ class FusedMatmulAddaPattern : public paddle::drr::DrrPatternBase { const auto &fused_linear_param_grad_add = res.Op(paddle::dialect::FusedLinearParamGradAddOp::name(), - {{{"multi_precision", muti_precision_attr}, + {{{"multi_precision", multi_precision_attr}, {"has_bias", res.BoolAttr(false)}}}); fused_linear_param_grad_add( {&res.Tensor("x"), @@ -283,7 +283,7 @@ class FusedMatmulAddbPattern : public paddle::drr::DrrPatternBase { }); paddle::drr::ResultPattern res = pat.ResultPattern(); - const auto &muti_precision_attr = + const auto &multi_precision_attr = res.ComputeAttr([](const paddle::drr::MatchContext &match_ctx) -> bool { return !(pir::GetDataTypeFromValue(match_ctx.Tensor("dweight")) == pir::GetDataTypeFromValue(match_ctx.Tensor("weight_grad"))); @@ -291,7 +291,7 @@ class FusedMatmulAddbPattern : public paddle::drr::DrrPatternBase { const auto &fused_linear_param_grad_add = res.Op(paddle::dialect::FusedLinearParamGradAddOp::name(), - {{{"multi_precision", muti_precision_attr}, + {{{"multi_precision", multi_precision_attr}, {"has_bias", res.BoolAttr(false)}}}); fused_linear_param_grad_add( {&res.Tensor("x"), @@ -341,7 +341,7 @@ class FusedMatmulAddGradAddaPattern : public paddle::drr::DrrPatternBase { }); paddle::drr::ResultPattern res = pat.ResultPattern(); - const auto &muti_precision_attr = + const auto &multi_precision_attr = res.ComputeAttr([](const paddle::drr::MatchContext &match_ctx) -> bool { return !(pir::GetDataTypeFromValue(match_ctx.Tensor("dweight")) == pir::GetDataTypeFromValue(match_ctx.Tensor("weight_grad"))); @@ -349,7 +349,7 @@ class FusedMatmulAddGradAddaPattern : public paddle::drr::DrrPatternBase { const auto &fused_linear_param_grad_add = res.Op(paddle::dialect::FusedLinearParamGradAddOp::name(), - {{{"multi_precision", muti_precision_attr}, + {{{"multi_precision", multi_precision_attr}, {"has_bias", res.BoolAttr(true)}}}); fused_linear_param_grad_add( {&res.Tensor("x"), @@ -399,14 +399,14 @@ class FusedMatmulAddGradAddbPattern : public paddle::drr::DrrPatternBase { }); paddle::drr::ResultPattern res = pat.ResultPattern(); - const auto &muti_precision_attr = + const auto &multi_precision_attr = res.ComputeAttr([](const paddle::drr::MatchContext &match_ctx) -> bool { return !(pir::GetDataTypeFromValue(match_ctx.Tensor("dweight")) == pir::GetDataTypeFromValue(match_ctx.Tensor("weight_grad"))); }); const auto 
&fused_linear_param_grad_add = res.Op(paddle::dialect::FusedLinearParamGradAddOp::name(), - {{{"multi_precision", muti_precision_attr}, + {{{"multi_precision", multi_precision_attr}, {"has_bias", res.BoolAttr(true)}}}); fused_linear_param_grad_add( {&res.Tensor("x"), diff --git a/paddle/fluid/pir/transforms/fusion/fused_weight_only_linear_pass.cc b/paddle/fluid/pir/transforms/fusion/fused_weight_only_linear_pass.cc index bf4ea92af67b2..fc415c3852e38 100644 --- a/paddle/fluid/pir/transforms/fusion/fused_weight_only_linear_pass.cc +++ b/paddle/fluid/pir/transforms/fusion/fused_weight_only_linear_pass.cc @@ -123,9 +123,9 @@ class FusedWeightOnlyLinearPass : public pir::PatternRewritePass { } bool CanApplyOn(pir::Operation *op) const override { - int sm_vesion = getSMVersion(); - if (sm_vesion != 70 && sm_vesion != 75 && sm_vesion != 80 && - sm_vesion != 86) { + int sm_version = getSMVersion(); + if (sm_version != 70 && sm_version != 75 && sm_version != 80 && + sm_version != 86) { return false; } return op->num_regions() > 0; diff --git a/paddle/fluid/pir/transforms/sub_graph_detector.cc b/paddle/fluid/pir/transforms/sub_graph_detector.cc index 0690bc1c8399c..0e9547f7642c7 100644 --- a/paddle/fluid/pir/transforms/sub_graph_detector.cc +++ b/paddle/fluid/pir/transforms/sub_graph_detector.cc @@ -316,11 +316,11 @@ bool SubgraphDetector::FuseSubGraph(SubGraphPtr subgraph_ptr) { if (!consumer->substitute) { continue; } - // fast depency check. + // fast dependency check. if (IsDependencySimplify(producer, consumer, consumers)) { continue; } - // global depency check. + // global dependency check. if (IsDependency(producer, consumer, consumers)) { continue; } @@ -341,7 +341,7 @@ bool SubgraphDetector::FuseSubGraph(SubGraphPtr subgraph_ptr) { producer->ops.end(), candidate->ops.begin(), candidate->ops.end()); producer->op_set.insert(candidate->op_set.begin(), candidate->op_set.end()); - // update bound for check depency + // update bound for check dependency producer->max_depth = std::max(producer->max_depth, candidate->max_depth); producer->min_depth = std::min(producer->min_depth, candidate->min_depth); @@ -364,7 +364,7 @@ bool SubgraphDetector::FuseSubGraph(SubGraphPtr subgraph_ptr) { tmp->producers.erase(candidate); } - // remove candicate in producer/consumer + // remove candidate in producer/consumer producer->producers.erase(candidate); producer->consumers.erase(candidate); @@ -387,7 +387,7 @@ bool SubgraphDetector::FuseSubGraph(SubGraphPtr subgraph_ptr) { return true; } -// check exist depency. +// check exist dependency. bool SubgraphDetector::IsDependency( const SubGraphPtr& producer_g, const SubGraphPtr& consumer, diff --git a/paddle/fluid/pir/transforms/sub_graph_detector.h b/paddle/fluid/pir/transforms/sub_graph_detector.h index 1b7ec2bc5da6a..424855b02ddcc 100644 --- a/paddle/fluid/pir/transforms/sub_graph_detector.h +++ b/paddle/fluid/pir/transforms/sub_graph_detector.h @@ -51,7 +51,7 @@ class SubgraphDetector { void DoSubGraphFusion(); bool FuseSubGraph(SubGraphPtr subgraph_ptr); - // check exist depency. + // check exist dependency. 
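//
// A note on the two checks: FuseSubGraph above calls IsDependencySimplify
// first, a cheap filter built from the min_depth/max_depth bounds it
// maintains, and only falls back to the full walk in IsDependency below.
// A hedged sketch of that filter's pruning idea (a hypothetical helper,
// not the real member; it assumes depth strictly grows along edges):
//
//   bool CanNeverReach(const SubGraphPtr& from, const SubGraphPtr& to) {
//     // a path only moves to strictly greater depth, so it can never end
//     // at a depth no greater than where it started
//     return from->min_depth >= to->max_depth;
//   }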
bool IsDependency(const SubGraphPtr& producer_g, const SubGraphPtr& consumer, const std::unordered_set& consumers); From 4bebcfe53bff5d6e7fd1d350db06d91814043530 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 29 Feb 2024 23:25:37 +0800 Subject: [PATCH 20/55] Fix transfrom transform, etc (#62183) --- paddle/fluid/operators/pull_gpups_sparse_op.h | 4 ++-- paddle/fluid/operators/py_func_op.cc | 2 +- paddle/fluid/operators/randperm_op.h | 6 +++--- paddle/fluid/operators/read_file_op.cc | 2 +- paddle/fluid/operators/repeat_interleave_op.cc | 4 ++-- paddle/fluid/operators/reshape_op.cc | 2 +- paddle/fluid/operators/split_op.cc | 2 +- paddle/fluid/operators/sum_op.cc | 2 +- paddle/fluid/operators/svd_helper.h | 8 ++++---- paddle/fluid/operators/tdm_sampler_op.h | 4 ++-- paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc | 2 +- paddle/fluid/operators/tile_op.cc | 2 +- paddle/fluid/operators/top_k_op.h | 2 +- paddle/fluid/operators/top_k_op_xpu.cc | 2 +- paddle/fluid/operators/transfer_layout_op.h | 6 +++--- paddle/fluid/operators/transpose_op.cc | 2 +- .../fluid/prim/utils/static/composite_grad_desc_maker.h | 2 +- 17 files changed, 27 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/operators/pull_gpups_sparse_op.h b/paddle/fluid/operators/pull_gpups_sparse_op.h index d8fdadd99cbd4..e5e08cfdde685 100644 --- a/paddle/fluid/operators/pull_gpups_sparse_op.h +++ b/paddle/fluid/operators/pull_gpups_sparse_op.h @@ -30,7 +30,7 @@ static void PullGpuPSSparseFunctor(const framework::ExecutionContext &ctx) { auto embedding_size_vec = ctx.Attr>("size"); const auto slot_size = inputs.size(); std::vector all_keys(slot_size); - // GpuPSPS only supports float now + // GpuPS only supports float now std::vector all_values(slot_size); std::vector slot_lengths(slot_size); for (size_t i = 0; i < slot_size; i++) { @@ -80,7 +80,7 @@ static void PushGpuPSSparseFunctor(const framework::ExecutionContext &ctx) { cur_batch_size, platform::errors::PreconditionNotMet( "The batch size of all input slots should be same, " - "please cheack")); + "please check")); } const float *grad_value = d_output[i]->data(); all_grad_values[i] = grad_value; diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc index ecdded21bb3e6..7d9c8ceca4943 100644 --- a/paddle/fluid/operators/py_func_op.cc +++ b/paddle/fluid/operators/py_func_op.cc @@ -119,7 +119,7 @@ static void CallPythonFunc(py::object *callable, out->ShareDataWith(*py_out_tensor); } catch (py::cast_error &) { PADDLE_THROW(platform::errors::InvalidArgument( - "py::cast to phi::DenseTensor error. The %d-th output expection is " + "py::cast to phi::DenseTensor error. 
The %d-th output exception is " "phi::DenseTensor", i)); } diff --git a/paddle/fluid/operators/randperm_op.h b/paddle/fluid/operators/randperm_op.h index 96981a4728402..560fdeb42eaa3 100644 --- a/paddle/fluid/operators/randperm_op.h +++ b/paddle/fluid/operators/randperm_op.h @@ -29,7 +29,7 @@ namespace paddle { namespace operators { template -static inline void random_permate(T* data_ptr, int num, unsigned int seed) { +static inline void random_permute(T* data_ptr, int num, unsigned int seed) { auto engine = phi::GetCPURandomEngine(seed); for (int i = 0; i < num; ++i) { data_ptr[i] = static_cast(i); @@ -50,13 +50,13 @@ class RandpermKernel : public framework::OpKernel { if (platform::is_cpu_place(ctx.GetPlace())) { T* out_data = out_tensor->mutable_data(platform::CPUPlace()); - random_permate(out_data, n, seed); + random_permute(out_data, n, seed); } else { phi::DenseTensor tmp_tensor; tmp_tensor.Resize(common::make_ddim({n})); T* tmp_data = tmp_tensor.mutable_data(platform::CPUPlace()); - random_permate(tmp_data, n, seed); + random_permute(tmp_data, n, seed); framework::TensorCopy(tmp_tensor, ctx.GetPlace(), out_tensor); } } diff --git a/paddle/fluid/operators/read_file_op.cc b/paddle/fluid/operators/read_file_op.cc index c19d0a6344ce5..a65b51d24e245 100644 --- a/paddle/fluid/operators/read_file_op.cc +++ b/paddle/fluid/operators/read_file_op.cc @@ -46,7 +46,7 @@ class ReadFileOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( This operator read a file. )DOC"); - AddAttr("filename", "Path of the file to be readed.") + AddAttr("filename", "Path of the file to be read.") .SetDefault({}); } }; diff --git a/paddle/fluid/operators/repeat_interleave_op.cc b/paddle/fluid/operators/repeat_interleave_op.cc index 15b4b80cb739b..d0af82510bdc4 100644 --- a/paddle/fluid/operators/repeat_interleave_op.cc +++ b/paddle/fluid/operators/repeat_interleave_op.cc @@ -77,7 +77,7 @@ class RepeatInterleaveOp : public framework::OperatorWithKernel { } else if (repeats > 0) { output_dim[dim] = input_dim[dim] * repeats; } - VLOG(3) << "infershap out " << output_dim[dim]; + VLOG(3) << "infershape out " << output_dim[dim]; ctx->SetOutputDim("Out", common::make_ddim(output_dim)); auto type = ctx->GetInputsVarType("X")[0]; if (type == framework::proto::VarType::LOD_TENSOR) { @@ -124,7 +124,7 @@ class RepeatInterleaveOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "(Tensor) the input tensor."); AddInput("RepeatsTensor", - "the 1-D tensor containing the repeats alongsize the axis.") + "the 1-D tensor containing the repeats alongside the axis.") .AsDispensable(); AddOutput("Out", "the output tensor."); AddAttr("Repeats", "the number of repetitions for each element.") diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 822eaf514bac5..34d80604ae8b0 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -581,7 +581,7 @@ class Reshape2CompositeGradOpMaker : public prim::CompositeGradOpMakerBase { auto *dx_ptr = this->GetOutputPtr(&dx); std::string dx_name = this->GetOutputName(dx); - VLOG(6) << "Runing reshape2_grad composite func"; + VLOG(6) << "Running reshape2_grad composite func"; prim::reshape_grad(x, out_grad, dx_ptr); this->RecoverOutputName(dx, dx_name); } diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc index 1842ed34a5c67..ceb087fce4cfb 100644 --- a/paddle/fluid/operators/split_op.cc +++ b/paddle/fluid/operators/split_op.cc @@ -222,7 
+222,7 @@ class SplitCompositeGradOpMaker : public prim::CompositeGradOpMakerBase { "We don't support dynamic index or sections from tensor for split " "composite grad for now. ")); } else { - VLOG(6) << "Runing split_grad composite func"; + VLOG(6) << "Running split_grad composite func"; prim::split_grad(out_grad, axis, dx_ptr); this->RecoverOutputName(input_grad, dx_name); } diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 718f4876406af..d8b7e35d6d3a1 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -127,7 +127,7 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput( "X", - "A Varaible list. The shape and data type of the list elements" + "A Variable list. The shape and data type of the list elements" "should be consistent. Variable can be multi-dimensional Tensor" "or phi::DenseTensor, and data types can be: float32, float64, int32, " "int64.") diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index caa31565d4cf3..273e2c7b65100 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -271,7 +271,7 @@ struct DiagAndFillFunctor { template struct DeviceIndependenceTensorOperations { - // 1. Device indenpendence, for kernel reuse. + // 1. Device independence, for kernel reuse. // 2. Input and output is always tensor type. // 3. output phi::DenseTensor is alway allocated // 4. Basic phi::DenseTensor operator is supported @@ -315,7 +315,7 @@ struct DeviceIndependenceTensorOperations { } phi::DenseTensor Transpose(const phi::DenseTensor& x) { - // transpose the last two dimision + // transpose the last two dimension phi::DenseTensor ret; auto x_dim = x.dims(); auto x_vec = common::vectorize(x_dim); @@ -745,7 +745,7 @@ struct DeviceIndependenceTensorOperations { const framework::AttributeMap& attrs, std::vector out_shape, NameOutTensor out_str = {"Out"}) { - // varialble set dims must be phi::DenseTensor / SelectedRowTensor + // variable set dims must be phi::DenseTensor / SelectedRowTensor framework::Scope& local_scope = context.scope().NewScope(); framework::VariableNameMap op_outputs; for (auto out_name : out_str) { @@ -753,7 +753,7 @@ struct DeviceIndependenceTensorOperations { op_outputs[out_name].emplace_back("tmp_" + out_name); } auto out_var = local_scope.Var("tmp_Out"); // return the Out - // create Out phi::DenseTensor and allocat memory + // create Out phi::DenseTensor and allocate memory out_var->GetMutable()->mutable_data( common::make_ddim(out_shape), context.GetPlace()); // common::make_ddim(out_shape) diff --git a/paddle/fluid/operators/tdm_sampler_op.h b/paddle/fluid/operators/tdm_sampler_op.h index ec5587c330fc7..52f86d633307b 100644 --- a/paddle/fluid/operators/tdm_sampler_op.h +++ b/paddle/fluid/operators/tdm_sampler_op.h @@ -214,9 +214,9 @@ void TDMSamplerInner(const framework::ExecutionContext &context, label_vec[i * sample_res_length + offset] = 0; mask_vec[i * sample_res_length + offset] = 1; VLOG(3) << "TDM: node id: " << travel_data[start_offset + layer_idx] - << " Res append negitive " + << " Res append negative " << output_vec[i * sample_res_length + offset] - << " Label append negitive " + << " Label append negative " << label_vec[i * sample_res_length + offset] << " Mask append value " << mask_vec[i * sample_res_length + offset]; diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc index 
ad54a49f820f9..332008894d5b9 100644 --- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc @@ -173,7 +173,7 @@ class TeacherStudentSigmoidLossGradientOp platform::errors::InvalidArgument( "When Attr(soft_label) == false, the 2nd dimension of " "Input(Label) should be 1. But received Input(Label)'s 2nd " - "dimemsion " + "dimension " "is [%d]", label_dims[1])); } diff --git a/paddle/fluid/operators/tile_op.cc b/paddle/fluid/operators/tile_op.cc index 26657ce42f303..9d961bbd57122 100644 --- a/paddle/fluid/operators/tile_op.cc +++ b/paddle/fluid/operators/tile_op.cc @@ -185,7 +185,7 @@ class TileCompositeGradOpMaker : public prim::CompositeGradOpMakerBase { "We don't support RepeatTimes from tensor or repeat_times_tensor for " "tile composite grad for now. ")); } else { - VLOG(6) << "Runing tile_grad composite func"; + VLOG(6) << "Running tile_grad composite func"; prim::tile_grad( x, out_grad, paddle::experimental::IntArray(repeat_times), dx_ptr); this->RecoverOutputName(x_grad, dx_name); diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h index f8fa53e2ad505..b0d30f1d22d3b 100644 --- a/paddle/fluid/operators/top_k_op.h +++ b/paddle/fluid/operators/top_k_op.h @@ -46,7 +46,7 @@ class TopkKernel : public framework::OpKernel { T* output_data = output->mutable_data(ctx.GetPlace()); int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - // reshape input to a flattern matrix(like flat_inner_dims) + // reshape input to a flatten matrix(like flat_inner_dims) framework::DDim inputdims = input->dims(); const size_t row = common::product(common::slice_ddim(inputdims, 0, inputdims.size() - 1)); diff --git a/paddle/fluid/operators/top_k_op_xpu.cc b/paddle/fluid/operators/top_k_op_xpu.cc index 55d3fa8624a8c..fff713236e9a6 100644 --- a/paddle/fluid/operators/top_k_op_xpu.cc +++ b/paddle/fluid/operators/top_k_op_xpu.cc @@ -60,7 +60,7 @@ class TopkXPUKernel : public framework::OpKernel { int* indices_int_data = RAII_GUARD.alloc_l3_or_gm(indices->numel()); PADDLE_ENFORCE_XDNN_NOT_NULL(indices_int_data); - // reshape input to a flattern matrix(like flat_inner_dims) + // reshape input to a flatten matrix(like flat_inner_dims) framework::DDim inputdims = input->dims(); const size_t row = common::product(common::slice_ddim(inputdims, 0, inputdims.size() - 1)); diff --git a/paddle/fluid/operators/transfer_layout_op.h b/paddle/fluid/operators/transfer_layout_op.h index 52633640fa95b..2736171626121 100644 --- a/paddle/fluid/operators/transfer_layout_op.h +++ b/paddle/fluid/operators/transfer_layout_op.h @@ -110,7 +110,7 @@ class TransferLayoutFunctor { } VLOG(4) << "TransDataLayoutFromOneDNN: " << in_layout << "->" << target_layout; - // Case2 - transfrom from ONEDNN OPKernel to Non-ONEDNN OPKernel + // Case2 - transform from ONEDNN OPKernel to Non-ONEDNN OPKernel // Do transform via ONEDNN lib phi::funcs::TransDataLayoutFromOneDNN(in_layout, target_layout, @@ -119,11 +119,11 @@ class TransferLayoutFunctor { dev_ctx_.GetPlace()); } } else { - // Case3 - transfrom between Non-ONEDNN OPKernels + // Case3 - transform between Non-ONEDNN OPKernels TransDataLayout(dev_ctx_, in_tensor, &out_tensor); } #else - // Case3 - transfrom between Non-ONEDNN OPKernels + // Case3 - transform between Non-ONEDNN OPKernels TransDataLayout(dev_ctx_, in_tensor, &out_tensor); #endif framework::SetTensorToVariable(*in_, out_tensor, out_); diff --git a/paddle/fluid/operators/transpose_op.cc 
b/paddle/fluid/operators/transpose_op.cc index 417299d24db07..340728a1b8d1e 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -202,7 +202,7 @@ class Transpose2CompositeGradOpMaker : public prim::CompositeGradOpMakerBase { std::string dx_name = this->GetOutputName(dx); std::vector axis = static_cast>(this->Attr>("axis")); - VLOG(6) << "Runing transpose2_grad composite func"; + VLOG(6) << "Running transpose2_grad composite func"; prim::transpose_grad(out_grad, axis, dx_ptr); this->RecoverOutputName(dx, dx_name); } diff --git a/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h b/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h index 0dd5d6fd4115c..d471b5277e029 100644 --- a/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h +++ b/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h @@ -72,7 +72,7 @@ class CompositeGradOpMakerBase { virtual ~CompositeGradOpMakerBase() = default; virtual std::vector> operator()() { - VLOG(3) << "Runing Composite Grad func for " << fwd_op_.Type() << "_grad "; + VLOG(3) << "Running Composite Grad func for " << fwd_op_.Type() << "_grad "; this->Apply(); std::vector> ops; // TODO(jiabin): Support multiple blocks later From 97eb5ac589bda9af1f8db548e58bf4b3f4f4e5c1 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 29 Feb 2024 23:26:07 +0800 Subject: [PATCH 21/55] Update random_routing_op.cc (#62182) --- paddle/fluid/operators/random_routing_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/random_routing_op.cc b/paddle/fluid/operators/random_routing_op.cc index 9eaa3a664877c..dffcc9c361a66 100644 --- a/paddle/fluid/operators/random_routing_op.cc +++ b/paddle/fluid/operators/random_routing_op.cc @@ -22,7 +22,7 @@ class RandomRoutingOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Prob"), "Input", "Porb", "RandomRouting"); + OP_INOUT_CHECK(ctx->HasInput("Prob"), "Input", "Prob", "RandomRouting"); OP_INOUT_CHECK( ctx->HasInput("TopK_Value"), "Input", "TopKValue", "RandomRouting"); OP_INOUT_CHECK( From 108684db5854899ba67ebf3486bae44bc2fbf056 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 29 Feb 2024 23:26:41 +0800 Subject: [PATCH 22/55] Fix MaxSeqenceLenOp MaxSequenceLenOp, etc (#62181) --- paddle/fluid/operators/im2sequence_op.h | 16 +++++++------- paddle/fluid/operators/is_empty_op.h | 2 +- paddle/fluid/operators/isfinite_op.cc | 2 +- paddle/fluid/operators/linear_chain_crf_op.cc | 4 ++-- paddle/fluid/operators/linear_chain_crf_op.h | 8 +++---- paddle/fluid/operators/load_combine_op.h | 2 +- paddle/fluid/operators/load_op.cc | 2 +- paddle/fluid/operators/max_sequence_len_op.cc | 22 +++++++++---------- paddle/fluid/operators/nce_op.cc | 8 +++---- paddle/fluid/operators/nce_op.h | 4 ++-- paddle/fluid/operators/pad_op.cc | 2 +- .../operators/pull_box_extended_sparse_op.h | 2 +- 12 files changed, 37 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h index 18e6d429f1b16..5fb689d5b1be0 100644 --- a/paddle/fluid/operators/im2sequence_op.h +++ b/paddle/fluid/operators/im2sequence_op.h @@ -48,13 +48,13 @@ class Im2SequenceKernel : public framework::OpKernel { auto strides = ctx.Attr>("strides"); auto paddings = ctx.Attr>("paddings"); if (ctx.HasInput("Y") && batch_size > 1) { - const phi::DenseTensor* imgrealsize = 
ctx.Input("Y"); + const phi::DenseTensor* img_real_size = ctx.Input("Y"); auto out_stride = ctx.Attr>("out_stride"); phi::DenseTensor cpu_shape_tensor; paddle::framework::TensorCopySync( - *imgrealsize, platform::CPUPlace(), &cpu_shape_tensor); - std::vector imgreal_h; - std::vector imgreal_w; + *img_real_size, platform::CPUPlace(), &cpu_shape_tensor); + std::vector img_real_h; + std::vector img_real_w; std::vector output_height; std::vector output_width; int result = 0; @@ -72,12 +72,12 @@ class Im2SequenceKernel : public framework::OpKernel { } else { tmp_real_w = tmp_real_w / out_stride[1] + 1; } - imgreal_h.push_back(tmp_real_h); - imgreal_w.push_back(tmp_real_w); + img_real_h.push_back(tmp_real_h); + img_real_w.push_back(tmp_real_w); output_height.push_back(Im2SeqOutputSize( - imgreal_h[i], kernels[0], paddings[0], paddings[2], strides[0])); + img_real_h[i], kernels[0], paddings[0], paddings[2], strides[0])); output_width.push_back(Im2SeqOutputSize( - imgreal_w[i], kernels[1], paddings[1], paddings[3], strides[1])); + img_real_w[i], kernels[1], paddings[1], paddings[3], strides[1])); result += output_height[i] * output_width[i]; } diff --git a/paddle/fluid/operators/is_empty_op.h b/paddle/fluid/operators/is_empty_op.h index 3c9dfbf58fae5..7c78c33621314 100644 --- a/paddle/fluid/operators/is_empty_op.h +++ b/paddle/fluid/operators/is_empty_op.h @@ -29,7 +29,7 @@ class IsEmptyOpKernel : public framework::OpKernel { auto* output_tensor = context.Output("Out"); // Note: is_empty is always executed on CPU and the output data should - // always be allocated for CPUPlace. We reigister CUDA kernel for this op to + // always be allocated for CPUPlace. We register CUDA kernel for this op to // avoid the unnecessary data transform. output_tensor->mutable_data(platform::CPUPlace())[0] = common::product(input_tensor->dims()) == 0; diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index 0d80a1c36b071..710cdaeb707b6 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -86,7 +86,7 @@ If any X contains Inf or Nan, the Out will generate a indicator. Out = Inf if any X contains Inf, Out = Nan if any X contains Nan, Out = 0 if no Inf/Nan detected. -If X contains both Inf/Nan, it will return the first indicator it meeted. +If X contains both Inf/Nan, it will return the first indicator it met. %s )DOC", diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc index 46ff4c2e94a94..e017e43d7db2d 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -55,7 +55,7 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker { "probabilities of all possible unfinished sequences of tags that end " "at position $k$ with tag $v$. For each $k$, " "$\alpha[k, v]$ is a vector of length $D$ with a component for " - "each tag value $v$. This vector is called a forward vecotr and " + "each tag value $v$. This vector is called a forward vector and " "will also be used in backward computations.") .AsIntermediate(); AddOutput( @@ -105,7 +105,7 @@ CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and weights, denoted as $a$ here. 3. The next D values of Input(Transition) of this operator are for ending weights, denoted as $b$ here. -4. The remaning values of Input(Transition) are for transition weights, +4. The remaining values of Input(Transition) are for transition weights, denoted as $w$ here. 5. 
Denote Input(Label) as $s$ here. diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h index ad2fbefdfd71f..2891320506391 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.h +++ b/paddle/fluid/operators/linear_chain_crf_op.h @@ -234,7 +234,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel { static_cast(*std::max_element(lbl, lbl + seq_length)), tag_num, platform::errors::InvalidArgument( - "An invalid tag label that execesses the largest tag number.")); + "An invalid tag label that excesses the largest tag number.")); // Calculate the nominator part, which depends on the label sequence. ll += w[lbl[0]] /*start transition*/ + x[lbl[0]] + @@ -308,7 +308,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { // Now, all the inputs and outputs should be on the CPU memory. auto emission_dims = emission_exps->dims(); // Beta is the memo table used in dynamic programming to calculate the - // backwark vectors. For a backward vector i (the i-th row of beta), it + // backward vectors. For a backward vector i (the i-th row of beta), it // captures the unnormalized probabilities of partial sequences starting // at position i. phi::DenseTensor beta; @@ -372,7 +372,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { const size_t state_trans_base_idx = 2; // Calculate the backward vectors: beta. - // First, calculate the initialition state. + // First, calculate the initial state. for (size_t i = 0; i < tag_num; ++i) { beta_value[(seq_length - 1) * tag_num + i] = w_exps[tag_num + i]; } @@ -411,7 +411,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { T* trans_grad = transition_grad->data(); for (size_t k = 0; k < tag_num; ++k) { // Do not multiply by the output gradient here, because x_grad_mat has - // alrealy done this. + // already done this. trans_grad[k] += x_grad_mat(/*from start state*/ 0, k); trans_grad[tag_num + k] += x_grad_mat(/*to end state*/ seq_length - 1, k); diff --git a/paddle/fluid/operators/load_combine_op.h b/paddle/fluid/operators/load_combine_op.h index 9f15523ce0129..4641c39111fad 100644 --- a/paddle/fluid/operators/load_combine_op.h +++ b/paddle/fluid/operators/load_combine_op.h @@ -101,7 +101,7 @@ class LoadCombineOpKernel : public framework::OpKernel { framework::NFD(it->first, &tmp); if (tmp.empty()) { VLOG(0) << "The string " << it->first - << " was converted to unicode failedly! " + << " was converted to unicode unsuccessfully! 
" << "Then dropped to load it."; continue; } diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index dd85ccff87f2d..326746eb1e286 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -47,7 +47,7 @@ void LoadKernel(const Context& dev_ctx, PADDLE_ENFORCE_GE(seek, 0, phi::errors::InvalidArgument( - "seek witn tensor must great than or equal to 0")); + "seek with tensor must great than or equal to 0")); framework::DeserializeFromStream(fin, out, dev_ctx, seek, shape); } else { framework::DeserializeFromStream(fin, out, dev_ctx); diff --git a/paddle/fluid/operators/max_sequence_len_op.cc b/paddle/fluid/operators/max_sequence_len_op.cc index 813b1901760b9..1863787db3d3b 100644 --- a/paddle/fluid/operators/max_sequence_len_op.cc +++ b/paddle/fluid/operators/max_sequence_len_op.cc @@ -31,12 +31,12 @@ class OpBase; namespace paddle { namespace operators { -class MaxSeqenceLenOp : public framework::OperatorBase { +class MaxSequenceLenOp : public framework::OperatorBase { public: - MaxSeqenceLenOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) + MaxSequenceLenOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} private: @@ -50,7 +50,7 @@ class MaxSeqenceLenOp : public framework::OperatorBase { } }; -class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker { +class MaxSequenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("RankTable", "Input variable which is a LoDRankTable object"); @@ -65,11 +65,11 @@ class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker { } }; -class MaxSeqenceLenInferShape : public framework::InferShapeBase { +class MaxSequenceLenInferShape : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext *context) const override { OP_INOUT_CHECK( - context->HasInput("RankTable"), "Input", "RankTable", "MaxSeqenceLen"); + context->HasInput("RankTable"), "Input", "RankTable", "MaxSequenceLen"); context->SetOutputDim("Out", {1}); } }; @@ -78,8 +78,8 @@ class MaxSeqenceLenInferShape : public framework::InferShapeBase { REGISTER_OPERATOR( max_sequence_len, - paddle::operators::MaxSeqenceLenOp, - paddle::operators::MaxSeqenceLenOpProtoMaker, - paddle::operators::MaxSeqenceLenInferShape, + paddle::operators::MaxSequenceLenOp, + paddle::operators::MaxSequenceLenOpProtoMaker, + paddle::operators::MaxSequenceLenInferShape, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index f4320cd0b6796..1b622b7571667 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -149,19 +149,19 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { AddInput( "CustomDistProbs", - "(Tensor) It is used in 'CostumDist' sampler. " + "(Tensor) It is used in 'CustomDist' sampler. " "It is a tensor with shape [num_total_classes]." "The i-th element is the probability of the i-th class being sampled.") .AsDispensable(); AddInput( "CustomDistAlias", - "(Tensor) It is used in 'CostumDist' sampler. " + "(Tensor) It is used in 'CustomDist' sampler. " "It is a tensor with shape [num_total_classes]." 
"The i-th element is the probability of the i-th class being sampled.") .AsDispensable(); AddInput( "CustomDistAliasProbs", - "(Tensor) It is used in 'CostumDist' sampler. " + "(Tensor) It is used in 'CustomDist' sampler. " "It is a tensor with shape [num_total_classes]." "The i-th element is the probability of the i-th class being sampled.") .AsDispensable(); @@ -194,7 +194,7 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(10); AddAttr("sampler", "(int) Which sampler to be used to sample negative class." - "0: Uniform; 1: LogUniform; 2: CostumDist.") + "0: Uniform; 1: LogUniform; 2: CustomDist.") .SetDefault(0); AddAttr("seed", "(int) The seed used in sampler. If it is 0, " diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index a21c7c816e191..41262dca6e53c 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -146,7 +146,7 @@ class NCEKernel : public framework::OpKernel { default: { PADDLE_THROW(platform::errors::InvalidArgument( "Unsupported SamplerType. SamplerType should be 0: Uniform, " - "1: LogUniform or 2: CostumDist. Received SamplerType: %d", + "1: LogUniform or 2: CustomDist. Received SamplerType: %d", sampler_type)); } } @@ -332,7 +332,7 @@ class NCEGradKernel : public framework::OpKernel { default: { PADDLE_THROW(platform::errors::InvalidArgument( "Unsupported SamplerType. SamplerType should be 0: Uniform, " - "1: LogUniform or 2: CostumDist. Received SamplerType: %d", + "1: LogUniform or 2: CustomDist. Received SamplerType: %d", sampler_type)); } } diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc index e2a0b3e025381..1a0f7b317d288 100644 --- a/paddle/fluid/operators/pad_op.cc +++ b/paddle/fluid/operators/pad_op.cc @@ -146,7 +146,7 @@ class PadCompositeGradOpMaker : public prim::CompositeGradOpMakerBase { std::vector paddings = static_cast>(this->Attr>("paddings")); float pad_value = static_cast(this->Attr("pad_value")); - VLOG(6) << "Runing add_grad composite func"; + VLOG(6) << "Running add_grad composite func"; prim::pad_grad(x, out_grad, paddings, pad_value, dx_ptr); this->RecoverOutputName(x_grad, dx_name); diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.h b/paddle/fluid/operators/pull_box_extended_sparse_op.h index b9508a279505e..76e570f10fb64 100644 --- a/paddle/fluid/operators/pull_box_extended_sparse_op.h +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.h @@ -86,7 +86,7 @@ static void PushBoxExtendedSparseFunctor( cur_batch_size, platform::errors::PreconditionNotMet( "The batch size of all input slots should be same," - "please cheack")); + "please check")); } const float *grad_value = d_output[i]->data(); const float *grad_value_extend = d_output_extend[i]->data(); From 4fc1061358e7722c947e7e011bf5b9678899ee04 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 29 Feb 2024 23:27:20 +0800 Subject: [PATCH 23/55] Fix nerual neural, etc (#62179) --- .../operators/common_infer_shape_functions.cc | 4 ++-- .../fluid/operators/deformable_psroi_pooling_op.cc | 2 +- paddle/fluid/operators/dgc_op.cc | 2 +- paddle/fluid/operators/dropout_op.cc | 4 ++-- paddle/fluid/operators/expand_op.cc | 6 +++--- paddle/fluid/operators/expand_op.h | 14 +++++++------- paddle/fluid/operators/expand_v2_op.h | 10 +++++----- paddle/fluid/operators/fill_constant_op.cc | 2 +- paddle/fluid/operators/fused_token_prune_op.cc | 6 +++--- paddle/fluid/operators/gru_unit_op.h | 2 +- 10 files changed, 26 insertions(+), 26 deletions(-) diff --git 
a/paddle/fluid/operators/common_infer_shape_functions.cc b/paddle/fluid/operators/common_infer_shape_functions.cc index 52836ead345a1..1c13f873818f4 100644 --- a/paddle/fluid/operators/common_infer_shape_functions.cc +++ b/paddle/fluid/operators/common_infer_shape_functions.cc @@ -166,7 +166,7 @@ void BinaryOpBroadcastInferShape(framework::InferShapeContext *ctx) { "For binary broadcastable operator, if X is " "Sparse(VarType.SELECTED_ROWS" "), Y must be scalar, and the size of Y should be 1. " - "But reveived the size of Y = %s.", + "But received the size of Y = %s.", y_dims.size())); PADDLE_ENFORCE_EQ( y_dims[0], @@ -175,7 +175,7 @@ void BinaryOpBroadcastInferShape(framework::InferShapeContext *ctx) { "For binary broadcastable operator, if X is " "Sparse(VarType.SELECTED_ROWS" "), Y must be scalar, the first dimension of Y should be 1. " - "But reveived the first dimension of Y = %s.", + "But received the first dimension of Y = %s.", y_dims[0])); } else if (ctx->GetInputsVarType(x_name).front() != framework::proto::VarType::LOD_TENSOR) { diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cc b/paddle/fluid/operators/deformable_psroi_pooling_op.cc index 1e3e52d34e41c..5b339cf96c2b1 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cc +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cc @@ -101,7 +101,7 @@ class DeformablePSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { "The format is NCHW, where N is the number of ROIs, " "C is the number of output channels, " "H is the height of output, and " - "W is thewidth of output. "); + "W is the width of output. "); AddComment(R"DOC( **DeformablePSROIPooling Operator** DeformablePSROIPooling is a new method based Region of interest pooling diff --git a/paddle/fluid/operators/dgc_op.cc b/paddle/fluid/operators/dgc_op.cc index 06fb2874f2171..7325c4271f9c4 100644 --- a/paddle/fluid/operators/dgc_op.cc +++ b/paddle/fluid/operators/dgc_op.cc @@ -87,7 +87,7 @@ class DGCOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(true); AddAttr>("sparsity", - "(vecotr, float)" + "(vector, float)" "The period sparsity of k_select."); AddAttr("rampup_begin_step", diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 382a3f7ac920b..01df430f52161 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -108,7 +108,7 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Dropout Operator. -Dropout refers to randomly dropping out units in a nerual network. It is a +Dropout refers to randomly dropping out units in a neural network. It is a regularization technique for reducing overfitting by preventing neuron co-adaption during training. 
The dropout operator randomly set (according to the given dropout probability) the outputs of some units to zero, while others @@ -175,7 +175,7 @@ class DropoutCompositeGradOpMaker : public prim::CompositeGradOpMakerBase { auto mode = this->Attr("dropout_implementation"); prim::dropout_grad( mask, out_grad, p, is_test, mode, x_grad_p); - VLOG(3) << "Runing dropout_grad composite func"; + VLOG(3) << "Running dropout_grad composite func"; this->RecoverOutputName(x_grad, x_grad_name); } }; diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index 4c2dd99265781..71295296218f0 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -106,7 +106,7 @@ class ExpandOpMaker : public framework::OpProtoAndCheckerMaker { "expand_times_tensor and expand_times.") .AsDispensable(); AddInput("expand_times_tensor", - "(Tensor Tensor), epxand times for X." + "(Tensor Tensor), expand times for X." "It has a higher priority than expand_times, but a lower priority " "than ExpandTimes") .AsDuplicable() @@ -165,7 +165,7 @@ class ExpandGradOp : public framework::OperatorWithKernel { out_dims[0], platform::errors::InvalidArgument( "The first dimension size (%d) of Input(Out@GRAD) should be " - "equal to the crroresponding dimension size (%d) of Input(X)", + "equal to the corresponding dimension size (%d) of Input(X)", out_dims[0], x_dims[0])); start_pos = 1u; @@ -181,7 +181,7 @@ class ExpandGradOp : public framework::OperatorWithKernel { out_dims[i], platform::errors::InvalidArgument( "The %uth dimension size (%d) of Input(Out@GRAD) should be " - "equal to the multiplication of the crroresponding dimension " + "equal to the multiplication of the corresponding dimension " "sizes of Input(X) (%d) and expand_times (%d).", i, out_dims[i], diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h index 8ff69a537ff7f..ee100b3b48418 100644 --- a/paddle/fluid/operators/expand_op.h +++ b/paddle/fluid/operators/expand_op.h @@ -43,36 +43,36 @@ inline std::vector get_expand_times( expand_data = cpu_expand_tensor.data(); } #endif - auto vec_epxand_times = + auto vec_expand_times = std::vector(expand_data, expand_data + expand_tensor->numel()); - return vec_epxand_times; + return vec_expand_times; } auto list_expand_times_tensor = ctx.MultiInput("expand_times_tensor"); if (list_expand_times_tensor.size() > 0) { // get tensor from - std::vector vec_epxand_times; + std::vector vec_expand_times; for (size_t i = 0; i < list_expand_times_tensor.size(); ++i) { auto tensor = list_expand_times_tensor[i]; if (platform::is_gpu_place(tensor->place())) { phi::DenseTensor temp; paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); - vec_epxand_times.push_back(*temp.data()); + vec_expand_times.push_back(*temp.data()); } #ifdef PADDLE_WITH_XPU else if (platform::is_xpu_place(tensor->place())) { // NOLINT phi::DenseTensor temp; paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); - vec_epxand_times.push_back(*temp.data()); + vec_expand_times.push_back(*temp.data()); } #endif else { // NOLINT - vec_epxand_times.push_back(*tensor->data()); + vec_expand_times.push_back(*tensor->data()); } } - return vec_epxand_times; + return vec_expand_times; } else { return ctx.Attr>("expand_times"); } diff --git a/paddle/fluid/operators/expand_v2_op.h b/paddle/fluid/operators/expand_v2_op.h index 474ae818617fa..0a70faddb7d58 100644 --- a/paddle/fluid/operators/expand_v2_op.h +++ b/paddle/fluid/operators/expand_v2_op.h @@ 
-53,26 +53,26 @@ inline std::vector get_expand_shape(
       ctx.MultiInput("expand_shapes_tensor");
   if (list_expand_shapes_tensor.size() > 0) {
     // get tensor from
-    std::vector vec_epxand_shape;
+    std::vector vec_expand_shape;
     for (size_t i = 0; i < list_expand_shapes_tensor.size(); ++i) {
       auto tensor = list_expand_shapes_tensor[i];
       if (platform::is_gpu_place(tensor->place())) {
         phi::DenseTensor temp;
         paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
-        vec_epxand_shape.push_back(*temp.data());
+        vec_expand_shape.push_back(*temp.data());
       }
 #ifdef PADDLE_WITH_XPU
       else if (platform::is_xpu_place(tensor->place())) {  // NOLINT
         phi::DenseTensor temp;
         paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
-        vec_epxand_shape.push_back(*temp.data());
+        vec_expand_shape.push_back(*temp.data());
       }
 #endif
       else {  // NOLINT
-        vec_epxand_shape.push_back(*tensor->data());
+        vec_expand_shape.push_back(*tensor->data());
       }
     }
-    return vec_epxand_shape;
+    return vec_expand_shape;
   } else {
     return ctx.Attr>("shape");
   }
diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index 1263d156ce220..8a27649af864b 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -152,7 +152,7 @@ class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
             "device")
         .SetDefault(false);
     AddAttr("place_type",
-            "(int, default -1) allow mamually setting place where the "
+            "(int, default -1) allow manually setting place where the "
             "variable should be hold. "
            "-1: not set manually, determine the place by executor. "
            "0: CPUPlace. "
diff --git a/paddle/fluid/operators/fused_token_prune_op.cc b/paddle/fluid/operators/fused_token_prune_op.cc
index 021aa95b1fe2c..9fab5c8e7c48d 100644
--- a/paddle/fluid/operators/fused_token_prune_op.cc
+++ b/paddle/fluid/operators/fused_token_prune_op.cc
@@ -39,7 +39,7 @@ class FusedTokenPruneOpMaker : public framework::OpProtoAndCheckerMaker {
              "The input of fused_token_prune op, whose shape should be [bsz, "
              "num_head, "
              "max_seq_len, max_seq_len] and dtype should be float32/float64."
-             "Mask is corresponding to Attn's elemnts one by one. Elements of Attn "
+             "Mask corresponds to Attn's elements one by one. Elements of Attn "
             "will be set to zero if their corresponding mask is smaller than 0."
             "This process happens before sorting X by attn.");

@@ -56,7 +56,7 @@ class FusedTokenPruneOpMaker : public framework::OpProtoAndCheckerMaker {
               "slimmed_seq_len, C]."
               "The tokens of X will be sorted by Attn firstly and then the "
               "last (max_seq_len - slimmed_seq_len)"
-              "tokens will be deleted. SlimmedX is the remainning part of X. "
+              "tokens will be deleted. SlimmedX is the remaining part of X. "
               "");

     AddOutput(
@@ -82,7 +82,7 @@ class FusedTokenPruneOpMaker : public framework::OpProtoAndCheckerMaker {
         1. Elements of Attn will be set to zero if their corresponding mask is smaller than 0.
         2. The second dimension of X will be sorted by Attn.
         3. The last (max_seq_len - slimmed_seq_len) lines of X will be pruned.
-        4. The remainning part of sorted X will output.
+        4. The remaining part of sorted X will be output.
)DOC"); } }; diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h index 9309ca0417f62..933176433e2d7 100644 --- a/paddle/fluid/operators/gru_unit_op.h +++ b/paddle/fluid/operators/gru_unit_op.h @@ -105,7 +105,7 @@ class GRUUnitKernel : public framework::OpKernel { gate_data, frame_size * 3); - // calculate activited gate + // calculate activated gate Eigen::array extents{{batch_size, frame_size}}; Eigen::array u_offsets{{0, 0}}; ActCompute(context.Attr("gate_activation"), From 471c8fe657c61a4f242436a1cf43a3ec608970ea Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 29 Feb 2024 23:36:07 +0800 Subject: [PATCH 24/55] Fix StrightThroughEstimatorGradOp StraightThroughEstimatorGradOp (#62178) * Fix * Fix --- paddle/fluid/operators/fake_quantize_op.cc | 34 +++++++++++----------- paddle/fluid/operators/fake_quantize_op.cu | 4 +-- paddle/fluid/operators/fake_quantize_op.h | 4 +-- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index 0515a56d41d5b..a5169892187a2 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -825,7 +825,7 @@ And it will not quantize the input tensor. } }; -class StrightThroughEstimatorGradOp : public framework::OperatorWithKernel { +class StraightThroughEstimatorGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -835,11 +835,11 @@ class StrightThroughEstimatorGradOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasInput(out_grad_name), "Input", out_grad_name, - "StrightThroughEstimatorGradOp"); + "StraightThroughEstimatorGradOp"); OP_INOUT_CHECK(ctx->HasOutput(x_grad_name), "Output", x_grad_name, - "StrightThroughEstimatorGradOp"); + "StraightThroughEstimatorGradOp"); ctx->SetOutputDim(x_grad_name, ctx->GetInputDim(out_grad_name)); } @@ -853,13 +853,13 @@ class StrightThroughEstimatorGradOp : public framework::OperatorWithKernel { }; template -class StrightThroughEstimatorMaker : public framework::SingleGradOpMaker { +class StraightThroughEstimatorMaker : public framework::SingleGradOpMaker { public: using framework::SingleGradOpMaker::SingleGradOpMaker; protected: void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("stright_throuth_estimator_grad"); + grad_op->SetType("straight_through_estimator_grad"); grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); grad_op->SetAttrMap(this->Attrs()); @@ -888,8 +888,8 @@ REGISTER_OPERATOR( fake_quantize_dequantize_abs_max, ops::FakeQuantOrWithDequantAbsMaxOp, ops::FakeQuantOrWithDequantAbsMaxOpMaker, - ops::StrightThroughEstimatorMaker, - ops::StrightThroughEstimatorMaker); + ops::StraightThroughEstimatorMaker, + ops::StraightThroughEstimatorMaker); PD_REGISTER_STRUCT_KERNEL(fake_quantize_dequantize_abs_max, CPU, ALL_LAYOUT, @@ -924,8 +924,8 @@ REGISTER_OPERATOR( fake_quantize_dequantize_moving_average_abs_max, ops::FakeQuantOrWithDequantMovingAverageAbsMaxOp, ops::FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker, - ops::StrightThroughEstimatorMaker, - ops::StrightThroughEstimatorMaker); + ops::StraightThroughEstimatorMaker, + ops::StraightThroughEstimatorMaker); PD_REGISTER_STRUCT_KERNEL(fake_quantize_dequantize_moving_average_abs_max, CPU, ALL_LAYOUT, @@ -948,28 +948,28 @@ REGISTER_OPERATOR( moving_average_abs_max_scale, ops::MovingAverageAbsMaxScaleOp, 
ops::MovingAverageAbsMaxScaleOpMaker, - ops::StrightThroughEstimatorMaker, - ops::StrightThroughEstimatorMaker); + ops::StraightThroughEstimatorMaker, + ops::StraightThroughEstimatorMaker); PD_REGISTER_STRUCT_KERNEL(moving_average_abs_max_scale, CPU, ALL_LAYOUT, ops::MovingAverageAbsMaxScaleKernel, float) {} -REGISTER_OPERATOR(stright_throuth_estimator_grad, - ops::StrightThroughEstimatorGradOp); -PD_REGISTER_STRUCT_KERNEL(stright_throuth_estimator_grad, +REGISTER_OPERATOR(straight_through_estimator_grad, + ops::StraightThroughEstimatorGradOp); +PD_REGISTER_STRUCT_KERNEL(straight_through_estimator_grad, CPU, ALL_LAYOUT, - ops::StrightThroughEstimatorGradKernel, + ops::StraightThroughEstimatorGradKernel, float) {} REGISTER_OPERATOR( fake_channel_wise_quantize_dequantize_abs_max, ops::FakeChannelWiseQuantizeDequantizeAbsMaxOp, ops::FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker, - ops::StrightThroughEstimatorMaker, - ops::StrightThroughEstimatorMaker); + ops::StraightThroughEstimatorMaker, + ops::StraightThroughEstimatorMaker); PD_REGISTER_STRUCT_KERNEL(fake_channel_wise_quantize_dequantize_abs_max, CPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index bf990a451eb2d..68ceaca46d04f 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -60,10 +60,10 @@ PD_REGISTER_STRUCT_KERNEL(fake_quantize_dequantize_moving_average_abs_max, ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel, float, float16) {} -PD_REGISTER_STRUCT_KERNEL(stright_throuth_estimator_grad, +PD_REGISTER_STRUCT_KERNEL(straight_through_estimator_grad, GPU, ALL_LAYOUT, - ops::StrightThroughEstimatorGradKernel, + ops::StraightThroughEstimatorGradKernel, float, float16) {} PD_REGISTER_STRUCT_KERNEL(fake_channel_wise_quantize_dequantize_abs_max, diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index dd8675331fce6..6387018d1865e 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -446,7 +446,7 @@ class MovingAverageAbsMaxScaleKernel : public framework::OpKernel { }; template -class StrightThroughEstimatorGradKernel : public framework::OpKernel { +class StraightThroughEstimatorGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *d_out = @@ -455,7 +455,7 @@ class StrightThroughEstimatorGradKernel : public framework::OpKernel { auto *d_x = context.Output(x_grad_name); PADDLE_ENFORCE_NOT_NULL(d_x, platform::errors::PreconditionNotMet( - "StrightThroughEstimatorGradKernel " + "StraightThroughEstimatorGradKernel " "doesn't have the output named %s.", x_grad_name)); From cc1a2314e4754ff2f6e7303b422f3f2f1b2c28e7 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 29 Feb 2024 23:36:51 +0800 Subject: [PATCH 25/55] Fix summuation summation, etc(#62172) --- paddle/fluid/operators/cross_entropy_op.cc | 6 ++--- paddle/fluid/operators/cross_entropy_op.h | 6 ++--- paddle/fluid/operators/cudnn_lstm_op.cc | 2 +- .../custom_device_common_op_registry.cc | 12 +++++----- paddle/fluid/operators/data_norm_op.cc | 22 +++++++++---------- 5 files changed, 24 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index 3a90012e1763a..cc2b4b4252835 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -239,7 +239,7 @@ class 
CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { "represents the cross entropy loss."); AddAttr("soft_label", "(bool, default false), a flag indicating whether to " - "interpretant the given labels as soft labels.") + "interpret the given labels as soft labels.") .SetDefault(false); AddAttr("ignore_index", "(int, default -100), Specifies a target value that is" @@ -268,10 +268,10 @@ computation. $Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}$ - Please make sure that in this case the summuation of each row of Label + Please make sure that in this case the summation of each row of Label equals one. -3) One-hot cross-entropy with vecterized Input(Label): +3) One-hot cross-entropy with vectorized Input(Label): As a special case of 2), when each row of Input(Label) has only one non-zero element (equals 1), soft-label cross-entropy degenerates to a one-hot cross-entropy with one-hot label representation. diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h index d755cb1639572..5b76cc9a65a2b 100644 --- a/paddle/fluid/operators/cross_entropy_op.h +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -62,9 +62,9 @@ class CrossEntropyOpKernel : public framework::OpKernel { }; template -class XeSoftlabelGradFunctor { +class XeSoftLabelGradFunctor { public: - XeSoftlabelGradFunctor(T* dx, + XeSoftLabelGradFunctor(T* dx, const T* dy, // NOLINT const T* x, // NOLINT const T* label, // NOLINT @@ -137,7 +137,7 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { int64_t class_num = x->dims()[rank - 1]; int64_t ignore_index = ctx.Attr("ignore_index"); if (ctx.Attr("soft_label")) { - XeSoftlabelGradFunctor functor(dx_data, + XeSoftLabelGradFunctor functor(dx_data, dy->data(), x->data(), label->data(), diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc index e61512924f81d..a082dbbcb8bcb 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cc @@ -152,7 +152,7 @@ the cell input ct-1 and the previous layer input xt given matrices W, R and bias which is computed based on the current input and the previous hidden state. 
Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, -X represensts a matrix multiplication +X represents a matrix multiplication )DOC"); diff --git a/paddle/fluid/operators/custom_device_common_op_registry.cc b/paddle/fluid/operators/custom_device_common_op_registry.cc index 9573809d6c7cc..950b7f0663658 100644 --- a/paddle/fluid/operators/custom_device_common_op_registry.cc +++ b/paddle/fluid/operators/custom_device_common_op_registry.cc @@ -465,10 +465,10 @@ class CSoftmaxWithCrossEntropyGradCustomDeviceKernel framework::TensorCopy( *softmax, context.GetPlace(), context.device_context(), logit_grad); } - const auto sofrmax_dims = softmax->dims(); - const int axis = sofrmax_dims.size() - 1; - const int N = phi::funcs::SizeToAxis(axis, sofrmax_dims); - const int D = phi::funcs::SizeFromAxis(axis, sofrmax_dims); + const auto softmax_dims = softmax->dims(); + const int axis = softmax_dims.size() - 1; + const int N = phi::funcs::SizeToAxis(axis, softmax_dims); + const int D = phi::funcs::SizeFromAxis(axis, softmax_dims); const auto& label_type = labels->dtype(); if (label_type == phi::DataType::INT32 || @@ -514,7 +514,7 @@ class CSoftmaxWithCrossEntropyGradCustomDeviceKernel logit_grad ->ShareDataWith(*reinterpret_cast( logits_grad_out_tensor2.impl().get())) - .Resize(sofrmax_dims); + .Resize(softmax_dims); } else { PADDLE_THROW(phi::errors::Unavailable( "CustomDevice c_softmax_with_cross_entropy_grad " @@ -853,7 +853,7 @@ class AssignPosCustomDeviceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { // assign pos decides which tokens should be fetched belong to specially - // counter orderingly. + // counter orderly. auto cum_count = context.Input( "cum_count"); // (counter number) int32 | int64 auto numbers = context.Input( diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc index 32cc8b49cd007..cc3a224a7e862 100644 --- a/paddle/fluid/operators/data_norm_op.cc +++ b/paddle/fluid/operators/data_norm_op.cc @@ -81,28 +81,28 @@ class DataNormOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize").size(), 1UL, platform::errors::InvalidArgument( - "The input dim of BatchSize shouold be 1")); + "The input dim of BatchSize should be 1")); PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum").size(), 1UL, platform::errors::InvalidArgument( - "The input dim of BatchSum shouold be 1")); + "The input dim of BatchSum should be 1")); PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum").size(), 1UL, platform::errors::InvalidArgument( - "The input dim of BatchSquareSum shouold be 1")); + "The input dim of BatchSquareSum should be 1")); if (ctx->IsRuntime()) { PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize")[0], C, platform::errors::InvalidArgument( - "The input dim[0] of BatchSize shouold be C")); + "The input dim[0] of BatchSize should be C")); PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum")[0], C, platform::errors::InvalidArgument( - "The input dim[0] of BatchSum shouold be C")); + "The input dim[0] of BatchSum should be C")); PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum")[0], C, platform::errors::InvalidArgument( - "The input dim[0] of BatchSqureSum shouold be C")); + "The input dim[0] of BatchSquareSum should be C")); } if (enable_scale_and_shift) { @@ -112,10 +112,10 @@ class DataNormOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( scale_dim.size(), 1UL, - 
platform::errors::InvalidArgument("the dimensionof scale" + platform::errors::InvalidArgument("the dimension of scale" "must equal to 1. But received: " "the shape of scale is [%s], " - "the dimensionof scale is [%d]", + "the dimension of scale is [%d]", scale_dim, scale_dim.size())); PADDLE_ENFORCE_EQ( @@ -691,7 +691,7 @@ class DataNormGradKernel : public framework::OpKernel { } } } else { - // calculate data sum and squre sum + // calculate data sum and square sum Eigen::Array sample_sum(C); Eigen::Array sample_square_sum(C); // calculate data sample sum and square sum @@ -769,7 +769,7 @@ PD_REGISTER_STRUCT_KERNEL( REGISTER_OP_VERSION(data_norm).AddCheckpoint( R"ROC( - upgrad data_norm op by adding scale_w to support scale and shift.)ROC", + upgrade data_norm op by adding scale_w to support scale and shift.)ROC", paddle::framework::compatible::OpVersionDesc().NewInput( "scale_w", - "scale_w is used to do scale duirng data_norm like batchnorm ")); + "scale_w is used to do scale during data_norm like batchnorm ")); From f471aa136bdfc648707e99bb5e46c598761fe984 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 29 Feb 2024 23:37:56 +0800 Subject: [PATCH 26/55] Fix checkponit checkpoint, etc (#62168) --- paddle/fluid/operators/activation_op.cc | 10 +++++----- paddle/fluid/operators/activation_op.h | 2 +- paddle/fluid/operators/assign_value_op.h | 2 +- paddle/fluid/operators/attention_lstm_op.cc | 2 +- paddle/fluid/operators/batch_norm_op.cc | 6 +++--- paddle/fluid/operators/beam_search_decode_op_def.h | 2 +- paddle/fluid/operators/chunk_eval_op.h | 8 ++++---- 7 files changed, 16 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index b848697128731..ddfbda809c1df 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -94,7 +94,7 @@ class ActivationGradOpMaker : public framework::SingleGradOpMaker { // paddle::Tensor dx = this->GetSingleInputGrad("X"); // auto* dx_ptr = this->GetOutputPtr(&dx); // std::string dx_name = this->GetOutputName(dx); -// VLOG(6) << "Runing hardswish_grad composite func"; +// VLOG(6) << "Running hardswish_grad composite func"; // prim::hardswish_grad(x, out_grad, dx_ptr); // this->RecoverOutputName(dx, dx_name); // } @@ -394,19 +394,19 @@ REGISTER_ACTIVATION_OP(mish, Mish, MishFunctor, MishGradFunctor); /* ========================== register checkpoint ===========================*/ REGISTER_OP_VERSION(leaky_relu) .AddCheckpoint( - R"ROC(fix leaky_relu, bahavior changed when alpha < 0 or alpha > 1)ROC", + R"ROC(fix leaky_relu, behavior changed when alpha < 0 or alpha > 1)ROC", paddle::framework::compatible::OpVersionDesc() .BugfixWithBehaviorChanged( - "leaky_relu calculate formula before checkponit: out = max(x, " + "leaky_relu calculate formula before checkpoint: out = max(x, " "alpha * x); after checkpoint: out = x if x > 0 else alpha * " "x")); REGISTER_OP_VERSION(hard_shrink) .AddCheckpoint( - R"ROC(fix hard_shrink, bahavior changed when threshold<0)ROC", + R"ROC(fix hard_shrink, behavior changed when threshold<0)ROC", paddle::framework::compatible::OpVersionDesc() .BugfixWithBehaviorChanged( - "hard_shrink calculate formula before checkponit: out = x * " + "hard_shrink calculate formula before checkpoint: out = x * " "((x < -threshold) + (x > threshold)); after checkpoint: out = " "x * (((x < -threshold) + (x > threshold)) > 0)")); diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 
8280c817b706a..38432f8768f59 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -371,7 +371,7 @@ struct AbsGradGradFunctor : public BaseActivationFunctor { // TODO(dengkaipeng): double gradient calculation for Square/Sqrt need // DOut(dy) as input(not output), tensor extraction is different from -// others. Impliment extraction kernel separately here. +// others. Implement extraction kernel separately here. inline void ExtractDoubleGradTensorWithInputDOut( const framework::ExecutionContext& ctx, const phi::DenseTensor** X, diff --git a/paddle/fluid/operators/assign_value_op.h b/paddle/fluid/operators/assign_value_op.h index 2a6a31ba03004..5ba8b9367e64e 100644 --- a/paddle/fluid/operators/assign_value_op.h +++ b/paddle/fluid/operators/assign_value_op.h @@ -29,7 +29,7 @@ typename std::enable_if::value>::type CopyVectorToTensor( const char* value_name, phi::DenseTensor* out, const framework::ExecutionContext& ctx) { - // phi::DenseTensore dtype is vector, it will be converted to + // phi::DenseTensor dtype is vector, it will be converted to // vector. // at the same time, we can not use vector to hold the value, because // the c++ use bit value to replace byte value. diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 9624f752b780f..6a0775e6331a7 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -488,7 +488,7 @@ class AttentionLSTMKernel : public framework::OpKernel { // gate act: sigmoid act_gate(D3, lstm_out_data, lstm_out_data); - // candicate act: tanh + // candidate act: tanh act_cand(D, lstm_out_data + D3, lstm_out_data + D3); // a = forget * prev_cell diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index fd05b018bbfb6..996c6af070631 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -308,11 +308,11 @@ void BatchNormOpMaker::Make() { "to true or is_test true. the behavior is equivalent. " "In train mode, when setting use_global_stats True, the " "global mean and variance are also used during train time, " - "the BN acts as scaling and shiffting.") + "the BN acts as scaling and shifting.") .SetDefault(false); AddAttr("trainable_statistics", "(bool, default false) Whether to calculate mean and variance " - "in test mode. If setting true in test mode, mean and variace " + "in test mode. If setting true in test mode, mean and variance " "will be calculated by current batch statistics.") .SetDefault(false); AddComment(R"DOC( @@ -586,7 +586,7 @@ class BatchNormCompositeGradOpMaker : public prim::CompositeGradOpMakerBase { auto use_global_stats = this->Attr("use_global_stats"); auto trainable_statistics = this->Attr("trainable_statistics"); - VLOG(3) << "Runing batch_norm composite func"; + VLOG(3) << "Running batch_norm composite func"; prim::batch_norm_grad(x, scale, bias, diff --git a/paddle/fluid/operators/beam_search_decode_op_def.h b/paddle/fluid/operators/beam_search_decode_op_def.h index 390f728322322..d358d8255fcf3 100644 --- a/paddle/fluid/operators/beam_search_decode_op_def.h +++ b/paddle/fluid/operators/beam_search_decode_op_def.h @@ -27,7 +27,7 @@ using LoDTensorArray = framework::LoDTensorArray; // all the lod have 2 levels. // The first is source level, the second is sentence level. 
-// source level describe how many prefixes (branchs) for each source sentence +// source level describe how many prefixes (branches) for each source sentence // (beam). sentence level describe how these candidates belong to the prefixes. const size_t kSourceLevel = 0; const size_t kSentenceLevel = 1; diff --git a/paddle/fluid/operators/chunk_eval_op.h b/paddle/fluid/operators/chunk_eval_op.h index 22b3accba8639..baad8719db37f 100644 --- a/paddle/fluid/operators/chunk_eval_op.h +++ b/paddle/fluid/operators/chunk_eval_op.h @@ -199,7 +199,7 @@ class ChunkEvalKernel : public framework::OpKernel { const int64_t* inference_data = inference->data(); const int64_t* label_data = label->data(); T* precision_data = precision->mutable_data(place); - T* racall_data = recall->mutable_data(place); + T* recall_data = recall->mutable_data(place); T* f1_data = f1->mutable_data(place); int64_t* num_infer_chunks_data = num_infer_chunks->mutable_data(place); @@ -280,14 +280,14 @@ class ChunkEvalKernel : public framework::OpKernel { ? 0 : static_cast(*num_correct_chunks_data) / (*num_infer_chunks_data); - *racall_data = !(*num_label_chunks_data) + *recall_data = !(*num_label_chunks_data) ? 0 : static_cast(*num_correct_chunks_data) / (*num_label_chunks_data); *f1_data = !(*num_correct_chunks_data) ? 0 - : 2 * (*precision_data) * (*racall_data) / - ((*precision_data) + (*racall_data)); + : 2 * (*precision_data) * (*recall_data) / + ((*precision_data) + (*recall_data)); } void EvalOneSeq(const int64_t* output, From eee170a56f00db78c1fcc049798996fa75d5c2a7 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 29 Feb 2024 23:38:28 +0800 Subject: [PATCH 27/55] Fix cadidate candidate, etc (#62163) --- paddle/cinn/backends/codegen_c_test.cc | 6 +++--- paddle/cinn/ir/schedule/impl/base.cc | 2 +- .../cinn/ir/schedule/impl/compute_location.cc | 4 ++-- paddle/cinn/ir/schedule/ir_schedule_error.cc | 2 +- paddle/cinn/ir/schedule/ir_schedule_util.cc | 8 ++++---- paddle/cinn/ir/schedule/schedule_desc.cc | 12 ++++++------ paddle/cinn/ir/test/tensor_test.cc | 2 +- paddle/cinn/lang/lower_impl.h | 6 +++--- paddle/cinn/optim/insert_debug_log_callee.cc | 2 +- paddle/cinn/optim/unroll_loops.cc | 2 +- .../runtime/cuda/cuda_intrinsics_reduce.cc | 18 +++++++++--------- paddle/cinn/runtime/cuda/cuda_util.cc | 4 ++-- 12 files changed, 34 insertions(+), 34 deletions(-) diff --git a/paddle/cinn/backends/codegen_c_test.cc b/paddle/cinn/backends/codegen_c_test.cc index 91f80c190f0f8..61adad6ade461 100644 --- a/paddle/cinn/backends/codegen_c_test.cc +++ b/paddle/cinn/backends/codegen_c_test.cc @@ -61,9 +61,9 @@ TEST(CodeGenC, module) { LOG(INFO) << "C.body: " << C->get_compute_op()->body.front(); Target target; - target.arch = Target::Arch ::X86; - target.bits = Target::Bit ::k32; - target.os = Target::OS ::Linux; + target.arch = Target::Arch::X86; + target.bits = Target::Bit::k32; + target.os = Target::OS::Linux; Module::Builder builder("module1", target); ast_gen_ius::TensorGroup tensor_group({A, B, C}); diff --git a/paddle/cinn/ir/schedule/impl/base.cc b/paddle/cinn/ir/schedule/impl/base.cc index d27bcd451f508..61632dcf2452e 100644 --- a/paddle/cinn/ir/schedule/impl/base.cc +++ b/paddle/cinn/ir/schedule/impl/base.cc @@ -428,7 +428,7 @@ Expr DyScheduleImpl::SampleCategorical( std::string primitive = "SampleCategorical"; std::ostringstream os; if (candidates.size() != probs.size()) { - os << "vector params(candidates) and vector prama(probs) must " + os << "vector params(candidates) and vector params(probs) must " "have same size in 
SampleCategorical!\n"; throw IRScheduleErrorHandler(primitive, os.str(), module_expr_); } diff --git a/paddle/cinn/ir/schedule/impl/compute_location.cc b/paddle/cinn/ir/schedule/impl/compute_location.cc index a077039994e81..585257899968f 100644 --- a/paddle/cinn/ir/schedule/impl/compute_location.cc +++ b/paddle/cinn/ir/schedule/impl/compute_location.cc @@ -42,11 +42,11 @@ void DyScheduleImpl::ComputeAt(const Expr& block, std::string primitive = "ComputeAt"; std::ostringstream os; if (!block.As()) { - os << "Expr prama(block) should be a ScheduleBlockRealize!\n"; + os << "Expr param(block) should be a ScheduleBlockRealize!\n"; throw IRScheduleErrorHandler(primitive, os.str(), module_expr_); } if (!loop.As()) { - os << "Expr prama(loop) should be a For node!\n"; + os << "Expr param(loop) should be a For node!\n"; throw IRScheduleErrorHandler(primitive, os.str(), module_expr_); } Expr root = this->GetRootBlock(block); diff --git a/paddle/cinn/ir/schedule/ir_schedule_error.cc b/paddle/cinn/ir/schedule/ir_schedule_error.cc index 3467df28e5485..0b7a098264632 100644 --- a/paddle/cinn/ir/schedule/ir_schedule_error.cc +++ b/paddle/cinn/ir/schedule/ir_schedule_error.cc @@ -21,7 +21,7 @@ namespace ir { std::string IRScheduleErrorHandler::GeneralErrorMessage() const { std::ostringstream os; - os << "[IRScheduleError] An error occurred in the scheduel primitive < " + os << "[IRScheduleError] An error occurred in the schedule primitive < " << this->primitive_ << " >. " << std::endl; os << indent_str_ << "[Error info] " << this->err_msg_; return os.str(); diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.cc b/paddle/cinn/ir/schedule/ir_schedule_util.cc index ba98382ebbf2f..739f17d06e80a 100644 --- a/paddle/cinn/ir/schedule/ir_schedule_util.cc +++ b/paddle/cinn/ir/schedule/ir_schedule_util.cc @@ -207,7 +207,7 @@ void ReplaceExpr(Expr* source, const std::vector& candidates) { CHECK_EQ(replaced.size(), candidates.size()) << "In ReplaceExpr, the size of Vars to be replaced must be equal to the " - "size of cadidate Exprs! Please check."; + "size of candidate Exprs! 
Please check."; if (replaced.empty()) return; std::map replacing_map; for (int i = 0; i < replaced.size(); ++i) { @@ -764,7 +764,7 @@ Expr ConstructNewLoopChain(const std::vector& chain, // } } // } } // - // We go throuph origin loop and check other body stmts, adding it as another + // We go through origin loop and check other body stmts, adding it as another // chain, such as: // // for (i, 0, 32) { @@ -1022,7 +1022,7 @@ void InsertBlock(Expr& for_loop, const Expr& insertion, int index) { // NOLINT auto dst_it = dst_block->stmts.begin() + index; if (dst_it->As()) { auto* inserted_block = dst_it->As()->true_case.As(); - CHECK(inserted_block) << "the IfThenElse node to be inserted shuold " + CHECK(inserted_block) << "the IfThenElse node to be inserted should " "contain a true_case block"; inserted_block->stmts.insert(inserted_block->stmts.begin(), insertion); } else { @@ -1060,7 +1060,7 @@ std::vector CalculateRequiredRegions( } std::vector required_buffer_range; - // deduce accessed regions of the provided tensor in block by itering each + // deduce accessed regions of the provided tensor in block by iterating each // required block for (const Expr& pro_node : provided_nodes) { std::string provided_tensor_name = diff --git a/paddle/cinn/ir/schedule/schedule_desc.cc b/paddle/cinn/ir/schedule/schedule_desc.cc index c9a26dfa1643d..b29d89fdd1dc9 100644 --- a/paddle/cinn/ir/schedule/schedule_desc.cc +++ b/paddle/cinn/ir/schedule/schedule_desc.cc @@ -27,7 +27,7 @@ namespace cinn { namespace ir { -// ------ Following codes are about `Apply` functions registry of variaous types +// ------ Following codes are about `Apply` functions registry of various types // of ScheduleDesc::Step class PackedStepContext; // uniformed function prototype of a scheduling operation in IRSchedule @@ -118,7 +118,7 @@ class PackedStepContext { return absl::get(attrs_.at(idx)); } catch (absl::bad_variant_access& ex) { LOG(FATAL) << "Attribute cast error, idx:" << idx - << ", get tpye:" << typeid(AttrType).name() + << ", get type:" << typeid(AttrType).name() << ", real index:" << attrs_.at(idx).index(); throw ex; } @@ -197,7 +197,7 @@ struct FreeFuncConverter { } }; -// used for formatting scheduling functions with variaous function signatures to +// used for formatting scheduling functions with various function signatures to // be uniformed form template struct ApplyFuncImpl; @@ -689,8 +689,8 @@ proto::ScheduleDesc ScheduleDesc::ToProto() const { } } - // each output Expr is represented by a formatted name, to be refered by - // suceeding steps + // each output Expr is represented by a formatted name, to be referred by + // succeeding steps for (auto&& expr : step.outputs) { std::string local_name = "e" + std::to_string(expr2name.size()); expr2name.emplace(expr, local_name); @@ -722,7 +722,7 @@ std::vector ScheduleDesc::ReplayWithProto( absl::flat_hash_map name2expr; std::vector last_outputs; - // resotre each scheduling step and apply to the new IRSchedule object + // restore each scheduling step and apply to the new IRSchedule object for (auto&& step_proto : desc_proto.steps()) { VLOG(4) << "Replay step:\n" << step_proto.DebugString(); ScheduleDesc::Step step; diff --git a/paddle/cinn/ir/test/tensor_test.cc b/paddle/cinn/ir/test/tensor_test.cc index cea1263f2aba3..4bf64f309735e 100644 --- a/paddle/cinn/ir/test/tensor_test.cc +++ b/paddle/cinn/ir/test/tensor_test.cc @@ -144,7 +144,7 @@ TEST(Tensor, ReshapeCopied) { stages->InsertLazily(B); - ir::Module::Builder builder("some_modue", 
cinn::common::DefaultHostTarget()); + ir::Module::Builder builder("some_module", cinn::common::DefaultHostTarget()); auto func = lang::Lower("fn", stages, {A, B}, {}, {}, &builder); backends::CodeGenC codegenc(cinn::common::DefaultHostTarget()); diff --git a/paddle/cinn/lang/lower_impl.h b/paddle/cinn/lang/lower_impl.h index b5f82ba7312e6..840fcfce860a0 100644 --- a/paddle/cinn/lang/lower_impl.h +++ b/paddle/cinn/lang/lower_impl.h @@ -150,8 +150,8 @@ class LowerImpl { std::vector CollectTemporaryTensors(); /** - * \brief Check both the tensor_args and sclar_args not contain duplication - * (different arguemnt with the same name). + * \brief Check both the tensor_args and scalar_args not contain duplication + * (different argument with the same name). */ void CheckArgsUnique(); @@ -304,7 +304,7 @@ struct MarkParallelMutator : public ir::IRMutator { auto it = parallels.find(tensor_n->name); if (it != parallels.end()) { for (int level : it->second) { - VLOG(1) << "Mark " << level << " Paralled"; + VLOG(1) << "Mark " << level << " Parallelled"; CHECK_LT(level, stack.size()); stack[level]->set_parallel(); } diff --git a/paddle/cinn/optim/insert_debug_log_callee.cc b/paddle/cinn/optim/insert_debug_log_callee.cc index fdab377bc88cc..1bcfd34bbaf9c 100644 --- a/paddle/cinn/optim/insert_debug_log_callee.cc +++ b/paddle/cinn/optim/insert_debug_log_callee.cc @@ -139,7 +139,7 @@ struct InsertDebugLogCalleeMutator : public ir::IRMutator<> { ir::IRMutator<>::Visit(&node->body, &node->body); auto deal_with_exprs = - [&](std::vector *exprs) { // deal with op->argument_preapre_exprs + [&](std::vector *exprs) { // deal with op->argument_prepare_exprs std::vector new_stmts; for (auto &expr : *exprs) { auto msg = diff --git a/paddle/cinn/optim/unroll_loops.cc b/paddle/cinn/optim/unroll_loops.cc index 9f2e8bf244e4c..7fa5e3a8b8222 100644 --- a/paddle/cinn/optim/unroll_loops.cc +++ b/paddle/cinn/optim/unroll_loops.cc @@ -62,7 +62,7 @@ struct UnrollMutator : public ir::IRMutator { void Visit(const ir::For* op, Expr* expr) override { IRMutator<>::Visit(op, expr); if (op->extent.As() == nullptr) { - VLOG(5) << "loop to be unrolled should have a contant extent"; + VLOG(5) << "loop to be unrolled should have a constant extent"; return; } int64_t extent = op->extent.as_int64(); diff --git a/paddle/cinn/runtime/cuda/cuda_intrinsics_reduce.cc b/paddle/cinn/runtime/cuda/cuda_intrinsics_reduce.cc index 15fcb4030e89b..685c466f7f9c9 100644 --- a/paddle/cinn/runtime/cuda/cuda_intrinsics_reduce.cc +++ b/paddle/cinn/runtime/cuda/cuda_intrinsics_reduce.cc @@ -146,22 +146,22 @@ CINN_REGISTER_HELPER(cuda_intrinsics_reduce) { #undef REGISTER_BLOCK_REDUCE_FUNC_IMPL -#define REGISTER_BLOCK_SHUFLLE_FUNC_IMPL(REDUCE_TYPE, DTYPE) \ +#define REGISTER_BLOCK_SHUFFLE_FUNC_IMPL(REDUCE_TYPE, DTYPE) \ REGISTER_FACKED_EXTERN_FUNC_HELPER(block_shuffle_##REDUCE_TYPE, target) \ .SetRetType() \ .AddInputType() \ .AddInputType() \ .End(); - EXPAND_REDUCE_INT32_REGISTER_MARCO(REGISTER_BLOCK_SHUFLLE_FUNC_IMPL) - EXPAND_REDUCE_INT64_REGISTER_MARCO(REGISTER_BLOCK_SHUFLLE_FUNC_IMPL) - EXPAND_REDUCE_BF16_REGISTER_MACRO(REGISTER_BLOCK_SHUFLLE_FUNC_IMPL) - EXPAND_REDUCE_FP16_REGISTER_MACRO(REGISTER_BLOCK_SHUFLLE_FUNC_IMPL) - EXPAND_REDUCE_FP32_REGISTER_MACRO(REGISTER_BLOCK_SHUFLLE_FUNC_IMPL) - EXPAND_REDUCE_FP64_REGISTER_MACRO(REGISTER_BLOCK_SHUFLLE_FUNC_IMPL) - EXPAND_REDUCE_BOOL_REGISTER_MACRO(REGISTER_BLOCK_SHUFLLE_FUNC_IMPL) + EXPAND_REDUCE_INT32_REGISTER_MARCO(REGISTER_BLOCK_SHUFFLE_FUNC_IMPL) + 
EXPAND_REDUCE_INT64_REGISTER_MARCO(REGISTER_BLOCK_SHUFFLE_FUNC_IMPL) + EXPAND_REDUCE_BF16_REGISTER_MACRO(REGISTER_BLOCK_SHUFFLE_FUNC_IMPL) + EXPAND_REDUCE_FP16_REGISTER_MACRO(REGISTER_BLOCK_SHUFFLE_FUNC_IMPL) + EXPAND_REDUCE_FP32_REGISTER_MACRO(REGISTER_BLOCK_SHUFFLE_FUNC_IMPL) + EXPAND_REDUCE_FP64_REGISTER_MACRO(REGISTER_BLOCK_SHUFFLE_FUNC_IMPL) + EXPAND_REDUCE_BOOL_REGISTER_MACRO(REGISTER_BLOCK_SHUFFLE_FUNC_IMPL) -#undef REGISTER_BLOCK_SHUFLLE_FUNC_IMPL +#undef REGISTER_BLOCK_SHUFFLE_FUNC_IMPL #undef EXPAND_REDUCE_INT32_REGISTER_MARCO #undef EXPAND_REDUCE_INT64_REGISTER_MARCO diff --git a/paddle/cinn/runtime/cuda/cuda_util.cc b/paddle/cinn/runtime/cuda/cuda_util.cc index 18c277339ddaf..074c35f1ce9f9 100644 --- a/paddle/cinn/runtime/cuda/cuda_util.cc +++ b/paddle/cinn/runtime/cuda/cuda_util.cc @@ -481,7 +481,7 @@ void cinn_call_batched_cublas(void *v_args, void *B = args[1 + g].operator cinn_buffer_t *()->memory; void *C = args[1 + num_gemm + g].operator cinn_buffer_t *()->memory; - // if opside is 1, exhange A,B. + // if opside is 1, exchange A,B. if (opside) { auto tmp = A; A = B; @@ -703,7 +703,7 @@ std::string debug_cudnn_pool_mode(cudnnPoolingMode_t pool_mode) { case CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING: return "avg_include_padding"; case CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING: - return "avg_exclulude_padding"; + return "avg_exclude_padding"; default: LOG(FATAL) << "Pool only support max and avg now!"; } From 2e3ea49e96823816af152e7480cf98b662c3b708 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 29 Feb 2024 23:39:27 +0800 Subject: [PATCH 28/55] Fix with_mateclass with_metaclass, etc (#62162) * Fix * ci * Fix --- python/paddle/amp/auto_cast.py | 6 +-- python/paddle/amp/debugging.py | 4 +- python/paddle/autograd/py_layer.py | 4 +- .../base/dygraph/tensor_patch_methods.py | 8 ++-- .../incubate/checkpoint/auto_checkpoint.py | 4 +- python/paddle/base/layers/io.py | 4 +- .../base/layers/layer_function_generator.py | 4 +- python/paddle/base/reader.py | 4 +- python/paddle/hapi/model.py | 46 +++++++++---------- .../incubate/asp/supported_layer_list.py | 14 +++--- python/paddle/incubate/asp/utils.py | 38 +++++++-------- python/paddle/incubate/autograd/primapi.py | 8 ++-- python/paddle/incubate/autotune.py | 8 ++-- .../distribute_transpiler/__init__.py | 6 +-- .../transformers/decorator_transformer.py | 20 ++++---- .../transformers/tensorhook_transformer.py | 4 +- python/paddle/jit/dy2static/utils.py | 10 ++-- python/paddle/jit/sot/symbolic/export.py | 10 ++-- python/paddle/tensor/math.py | 2 +- .../utils/cpp_extension/cpp_extension.py | 6 +-- 20 files changed, 106 insertions(+), 104 deletions(-) diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index 0286a668d10f5..5a271171e09ce 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -53,7 +53,7 @@ def __init__(self): self.model_parameters = [] self.use_master_grad = False self.already_register_final_backward_hook = False - self.already_classify_params_meshs = False # For dist + self.already_classify_params_meshes = False # For dist self.mesh2params = {} # For dist self.amp_dtype = 'float32' @@ -471,7 +471,7 @@ def master_grad_hook(): # NOTE(lizhiyu): To support semi-auto of dygraph mode, we must # classify the params of model into different calsses according to their process_mesh. # Otherwise, fault will occur. 
-            if not amp_global_state().already_classify_params_meshs:
+            if not amp_global_state().already_classify_params_meshes:
                 for param in amp_global_state().model_parameters:
                     if param is not None and param.process_mesh is not None:
                         if (
@@ -485,7 +485,7 @@ def master_grad_hook():
                             amp_global_state().mesh2params[
                                 param.process_mesh
                             ].append(param)
-            amp_global_state().already_classify_params_meshs = True
+            amp_global_state().already_classify_params_meshes = True

         if len(amp_global_state().mesh2params):
             for _, params in amp_global_state().mesh2params.items():
diff --git a/python/paddle/amp/debugging.py b/python/paddle/amp/debugging.py
index 0fd8fce8fe5f8..974daa0a90697 100644
--- a/python/paddle/amp/debugging.py
+++ b/python/paddle/amp/debugging.py
@@ -270,7 +270,7 @@ def _set_seed(self, flag):
             self.seed = self.initial_seed

         if self.seed > np.iinfo(np.uint32).max or self.seed < 0:
-            print("[Warnning: Seed must be between 0 and 2**32 - 1")
+            print("[Warning] Seed must be between 0 and 2**32 - 1")
             self.seed = 123

         # get random seed
@@ -616,7 +616,7 @@ def compare_accuracy(
             ...     [1, 5, 2, 0], dtype="float32"
             ... )
             ... z1 = x + y
-            ... out_excel = "compary_accuracy_out_excel.csv"
+            ... out_excel = "compare_accuracy_out_excel.csv"
             ... paddle.amp.debugging.compare_accuracy(
             ...     path, path, out_excel, loss_scale=1, dump_all_tensors=False
             ... )
diff --git a/python/paddle/autograd/py_layer.py b/python/paddle/autograd/py_layer.py
index 5ddf610bb032b..2843560f4a878 100644
--- a/python/paddle/autograd/py_layer.py
+++ b/python/paddle/autograd/py_layer.py
@@ -18,7 +18,7 @@
 __all__ = []


-def with_mateclass(meta, *bases):
+def with_metaclass(meta, *bases):
     class impl(meta):
         def __new__(cls, name, temp_bases, attrs):
             return meta(name, bases, attrs)
@@ -267,7 +267,7 @@ def __init__(cls, name, bases, attrs):
         return super().__init__(name, bases, attrs)


-class PyLayer(with_mateclass(PyLayerMeta, core.eager.PyLayer, PyLayerContext)):
+class PyLayer(with_metaclass(PyLayerMeta, core.eager.PyLayer, PyLayerContext)):
     """
     Paddle implements Python custom operators on the PaddlePaddle framework by creating a subclass of
     ``PyLayer``, which must comply with the following rules:
diff --git a/python/paddle/base/dygraph/tensor_patch_methods.py b/python/paddle/base/dygraph/tensor_patch_methods.py
index 7c7a3d60ebf45..275ab3a232d96 100644
--- a/python/paddle/base/dygraph/tensor_patch_methods.py
+++ b/python/paddle/base/dygraph/tensor_patch_methods.py
@@ -104,7 +104,7 @@ def _to_static_var(self, to_parameter=False, **kwargs):
         """
         # Note: getattr(self, attr, None) will call x.grad=x.gradient(), but gradient() only available in dygraph.
-        # It will fail. So, for propery that different between dynamic and static graph, should not getattr(self, attr, None).
+        # It will fail. So, for properties that differ between dynamic and static graph, do not use getattr(self, attr, None).
         attr_not_need_keys = [
             'grad',
             'T',
@@ -227,7 +227,7 @@ def set_value(self, value):
         # NOTE(wuweilong): self could be Tensor, the subsequent behavior are defined in different files
         # if self is Tensor, method value() return self that defined in this file, get_tensor() defined in eager_method.cc
-        # this Interface behavior will be unifed in the future.
+        # this Interface behavior will be unified in the future.
        if self.is_dist():
            if isinstance(value, paddle.Tensor) and value.is_dist():
                from paddle.distributed.auto_parallel.placement_type import (
@@ -702,7 +702,7 @@ def get_device_dtype_from_tensor(other):
         if size_args + size_kwargs > 3 or size_args + size_kwargs == 0:
             raise TypeError(
-                "to() received too mant arguments - expected one of:\n \
+                "to() received too many arguments - expected one of:\n \
                 * (Union[str, paddle.CPUPlace(), paddle.CUDAPlace(), paddle.CUDAPinnedPlace(), paddle.XPUPlace(), paddle.CustomPlace()] \
                 device, Union[str, paddle.dtype, numpy.dtype] dtype, bool blocking)\n \
                 * (Union[str, paddle.dtype, numpy.dtype] dtype, bool blocking)\n \
@@ -976,7 +976,7 @@ def __array__(self, dtype=None):
         return array

     def pre_deal_index(self, item):
-        # since in pybind there is no effiency way to transfer Py_Tuple/Py_List/Py_Range to Tensor
+        # since in pybind there is no efficient way to transfer Py_Tuple/Py_List/Py_Range to Tensor
         # we call this function in python level.
         item = list(item) if isinstance(item, tuple) else [item]
         for i, slice_item in enumerate(item):
diff --git a/python/paddle/base/incubate/checkpoint/auto_checkpoint.py b/python/paddle/base/incubate/checkpoint/auto_checkpoint.py
index 742289acd27f1..329cdc25ab083 100644
--- a/python/paddle/base/incubate/checkpoint/auto_checkpoint.py
+++ b/python/paddle/base/incubate/checkpoint/auto_checkpoint.py
@@ -419,7 +419,7 @@ def _serialize(self, pop_keys=["restored_from", "checkpoint_epoch_no"]):
         for k in pop_keys:
             d.pop(k, None)

-        # registerd exes
+        # registered exes
         d["exe_status"] = {}
         e = d["exe_status"]
         for k, t in self._exe_status.items():
@@ -625,7 +625,7 @@ def train_epoch_range(max_epoch_num, save_checkpoint_inter=None):
     global g_acp_type
     if not _get_checker().valid():
         logger.warning(
-            "auto checkpoint will take effect automaticly on PaddleCloud"
+            "auto checkpoint will take effect automatically on PaddleCloud"
         )
         for i in _normal_yield(max_epoch_num):
             yield i
diff --git a/python/paddle/base/layers/io.py b/python/paddle/base/layers/io.py
index 51f5b10fe0618..de9725ec28fac 100644
--- a/python/paddle/base/layers/io.py
+++ b/python/paddle/base/layers/io.py
@@ -74,7 +74,7 @@ def __create_shared_decorated_reader__(op_type, reader, attrs):
     var_name = unique_name(op_type)
     startup_blk = default_startup_program().current_block()
     startup_var = startup_blk.create_var(name=var_name)
-    startop_op = startup_blk.append_op(
+    startup_op = startup_blk.append_op(
         type=op_type,
         inputs={'UnderlyingReader': reader},
         outputs={'Out': [startup_var]},
@@ -83,7 +83,7 @@ def __create_shared_decorated_reader__(op_type, reader, attrs):
     startup_var.persistable = True
     main_prog_block = default_main_program().current_block()
     main_prog_var = _copy_reader_var_(main_prog_block, startup_var)
-    _copy_reader_create_op_(main_prog_block, startop_op)
+    _copy_reader_create_op_(main_prog_block, startup_op)
     return monkey_patch_reader_methods(main_prog_var)
diff --git a/python/paddle/base/layers/layer_function_generator.py b/python/paddle/base/layers/layer_function_generator.py
index 009cb2ae49a6b..a8128603e05cd 100644
--- a/python/paddle/base/layers/layer_function_generator.py
+++ b/python/paddle/base/layers/layer_function_generator.py
@@ -86,7 +86,7 @@ def _generate_doc_string_(
             buf.write(" (Tensor): ")
             buf.write(escape_math(each_input.comment))
             if each_input.duplicable:
-                buf.write(" Duplicatable.")
+                buf.write(" Duplicable.")
             if each_input.dispensable:
                 buf.write(" Optional.")
         buf.write('\n')
@@ -327,7 +327,7 @@ def func(x, name=None):
             and x.is_view_var
         ):
             raise ValueError(
-                'Sorry about what\'s happend. In to_static mode, {}\'s output variable {} is a viewed Tensor in dygraph. This will result in inconsistent calculation behavior between dynamic and static graphs. You must find the location of the strided API be called, and call {} = {}.assign().'.format(
+                'Sorry about what\'s happened. In to_static mode, {}\'s output variable {} is a viewed Tensor in dygraph. This will result in inconsistent calculation behavior between dynamic and static graphs. You must find the location where the strided API is called, and call {} = {}.assign().'.format(
                     inplace_op_type, x.name, x.name, x.nameb
                 )
             )
diff --git a/python/paddle/base/reader.py b/python/paddle/base/reader.py
index e90378249da03..d5695aec5b220 100644
--- a/python/paddle/base/reader.py
+++ b/python/paddle/base/reader.py
@@ -137,7 +137,7 @@ def _check_input_array(cls, item):
         arr = np.asarray(item)
         if arr.dtype == np.object_:
             raise TypeError(
-                "\n\tFaild to convert input data to a regular ndarray :\n\t* Usually "
+                "\n\tFailed to convert input data to a regular ndarray:\n\t* Usually "
                 "this means the input data contains nested lists with different lengths. "
                 "\n\t* Check the reader function passed to 'decorate_batch_generator'"
                 " to locate the data causes this issue.\n\t* Please consider using "
@@ -532,7 +532,7 @@ def __init__(
         # NOTE: the C++ LoDTensorBlockingQueue instance
         self._blocking_queue = None
         # NOTE: 1. In multiprocess mode, this thread is used to get next batch data from
-        # self._data_queue, then push it into self._blocking_queue; 2. In singleprocess
+        # self._data_queue, then push it into self._blocking_queue; 2. In single process
         # mode, this thread is used to get next batch data from self._batch_reader, then
         # push it into self._blocking_queue
         self._thread = None
diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py
index 7618590b376b7..328f3e0078052 100644
--- a/python/paddle/hapi/model.py
+++ b/python/paddle/hapi/model.py
@@ -293,7 +293,7 @@ def _update_input_info(inputs):
 class StaticGraphAdapter:
     """
-    Model traning/inference with a static graph.
+    Model training/inference with a static graph.

     """

@@ -633,7 +633,7 @@ def _make_program(self, mode):
         prog = self._orig_prog.clone()
         # NOTE: When defining learning rate scheduling in static-graph, ops to
         # increase the global step var and calculate learning rate would be
-        # prepended into _orig_prog. test program maked by `_orig_prog.clone`
+        # prepended into _orig_prog. test program made by `_orig_prog.clone`
         # also would include these ops. Thus must prune these ops in test
         # program, otherwise the global step would be changed in test.
if mode != 'train': @@ -794,16 +794,16 @@ def __init__(self, model): if self._nranks > 1: dist.init_parallel_env() - stradegy = paddle.distributed.parallel.ParallelStrategy() - stradegy.nranks = paddle.distributed.ParallelEnv().nranks - stradegy.local_rank = paddle.distributed.ParallelEnv().local_rank - stradegy.trainer_endpoints = ( + strategy = paddle.distributed.parallel.ParallelStrategy() + strategy.nranks = paddle.distributed.ParallelEnv().nranks + strategy.local_rank = paddle.distributed.ParallelEnv().local_rank + strategy.trainer_endpoints = ( paddle.distributed.ParallelEnv().trainer_endpoints ) - stradegy.current_endpoint = ( + strategy.current_endpoint = ( paddle.distributed.ParallelEnv().current_endpoint ) - self.ddp_model = paddle.DataParallel(self.model.network, stradegy) + self.ddp_model = paddle.DataParallel(self.model.network, strategy) @property def mode(self): @@ -879,7 +879,7 @@ def eval_batch(self, inputs, labels=None): outputs = self.model.network(*[paddle.to_tensor(x) for x in inputs]) - # Transfrom data to expected device + # Transform data to expected device expected_device = paddle.device.get_device() for o in to_list(outputs): o._to(device=expected_device) @@ -966,7 +966,7 @@ def load(self, param_state_pairs, optim_state, scaler_state=None): if scaler_state: self.model._scaler.load_state_dict(scaler_state) - # resotre optimizer states + # restore optimizer states if not self.model._optimizer or not optim_state: return @@ -1077,7 +1077,7 @@ class Model: or dict ({name: InputSpec}), and it couldn't be None in static graph. Default: None. labels (InputSpec|list|tuple|None, optional): `labels`, entry points of network, - could be a InputSpec instnace or list/tuple of InputSpec instances, + could be a InputSpec instance or list/tuple of InputSpec instances, or None. For static graph, if labels is required in loss, labels must be set. Otherwise, it could be None. Default: None. @@ -1676,7 +1676,7 @@ def prepare( ): """ - Configures the model before runing. + Configures the model before running. Args: optimizer (Optimizer|None, optional): Optimizer must be set in training @@ -1777,16 +1777,16 @@ def fit( Args: train_data (Dataset|DataLoader, optional): An iterable data loader is used for train. An instance of paddle paddle.io.Dataset or - paddle.io.Dataloader is recomended. Default: None. + paddle.io.Dataloader is recommended. Default: None. eval_data (Dataset|DataLoader, optional): An iterable data loader is used for evaluation at the end of epoch. If None, will not do evaluation. An instance of paddle.io.Dataset or paddle.io.Dataloader - is recomended. Default: None. + is recommended. Default: None. batch_size (int|list, optional): The batch size of train_data and eval_data. When train_data and eval_data are both the instance of Dataloader, this parameter will be ignored. Default: 1. epochs (int, optional): The number of epochs to train the model. Default: 1. - eval_freq (int, optional): The frequency, in number of epochs, an evalutation + eval_freq (int, optional): The frequency, in number of epochs, an evaluation is performed. Default: 1. log_freq (int, optional): The frequency, in number of steps, the training logs are printed. Default: 10. @@ -1800,7 +1800,7 @@ def fit( train_data when dataset size is not divisible by the batch size. When train_data is an instance of Dataloader, this parameter will be ignored. Default: False. - shuffle (bool, optional): Whther to shuffle train_data. When train_data is + shuffle (bool, optional): Whether to shuffle train_data. 
When train_data is an instance of Dataloader, this parameter will be ignored. Default: True. num_workers (int, optional): The number of subprocess to load data, 0 for no @@ -1810,7 +1810,7 @@ def fit( callbacks (Callback|None, optional): A list of `Callback` instances to apply during training. If None, :ref:`api_paddle_callbacks_ProgBarLogger` and :ref:`api_paddle_callbacks_ModelCheckpoint` are automatically inserted. Default: None. - accumulate_grad_batches (int, optional): The number of batches to accumulate gradident + accumulate_grad_batches (int, optional): The number of batches to accumulate gradient during training process before optimizer updates. It can mimic large batch size. Default: 1. num_iters (int|None, optional): The number of iterations to evaluate the model. @@ -2016,7 +2016,7 @@ def evaluate( Args: eval_data (Dataset|DataLoader): An iterable data loader is used for evaluation. An instance of paddle.io.Dataset or - paddle.io.Dataloader is recomended. + paddle.io.Dataloader is recommended. batch_size (int, optional): The batch size of train_data and eval_data. When eval_data is the instance of Dataloader, this argument will be ignored. Default: 1. @@ -2126,7 +2126,7 @@ def predict( Args: test_data (Dataset|DataLoader): An iterable data loader is used for predict. An instance of paddle.io.Dataset or paddle.io.Dataloader - is recomended. + is recommended. batch_size (int, optional): The batch size of test_data. When test_data is the instance of Dataloader, this argument will be ignored. Default: 1. num_workers (int, optional): The number of subprocess to load data, 0 for no subprocess @@ -2300,13 +2300,13 @@ def _run_one_epoch( # Data might come from different types of data_loader and have # different format, as following: # 1. DataLoader in static graph: - # [[input1, input2, ..., label1, lable2, ...]] + # [[input1, input2, ..., label1, label2, ...]] # 2. DataLoader in dygraph - # [input1, input2, ..., label1, lable2, ...] + # [input1, input2, ..., label1, label2, ...] # 3. custumed iterator yield concated inputs and labels: - # [input1, input2, ..., label1, lable2, ...] + # [input1, input2, ..., label1, label2, ...] # 4. custumed iterator yield separated inputs and labels: - # ([input1, input2, ...], [label1, lable2, ...]) + # ([input1, input2, ...], [label1, label2, ...]) # To handle all of these, flatten (nested) list to list. 
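The comment above enumerates four batch layouts that all collapse to one flat list. A paddle-free sketch of that flattening, where the local `flatten` helper is a stand-in assumption for `paddle.utils.flatten`:

~~~ python
def flatten(nested):
    """Recursively flatten nested lists/tuples into one flat list."""
    flat = []
    for item in (nested if isinstance(nested, (list, tuple)) else [nested]):
        if isinstance(item, (list, tuple)):
            flat.extend(flatten(item))
        else:
            flat.append(item)
    return flat

# Layouts 1, 2 and 4 from the comment above flatten to the same list.
assert flatten([["in1", "in2", "lab1", "lab2"]]) == ["in1", "in2", "lab1", "lab2"]
assert flatten(["in1", "in2", "lab1", "lab2"]) == ["in1", "in2", "lab1", "lab2"]
assert flatten((["in1", "in2"], ["lab1", "lab2"])) == ["in1", "in2", "lab1", "lab2"]
~~~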
data = paddle.utils.flatten(data) # LoDTensor.shape is callable, where LoDTensor comes from diff --git a/python/paddle/incubate/asp/supported_layer_list.py b/python/paddle/incubate/asp/supported_layer_list.py index 0ebc6ea2d3128..7720a1cf7127c 100644 --- a/python/paddle/incubate/asp/supported_layer_list.py +++ b/python/paddle/incubate/asp/supported_layer_list.py @@ -35,16 +35,16 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name): shape = weight_nparray.shape weight_pruned_nparray = copy.deepcopy(weight_nparray) weight_sparse_mask = np.ones_like(weight_pruned_nparray) - exlude_cond_shape2 = len(shape) == 2 and shape[0] < m - exlude_cond_shape4 = len(shape) == 4 and shape[1] < m - if exlude_cond_shape2: + exclude_cond_shape2 = len(shape) == 2 and shape[0] < m + exclude_cond_shape4 = len(shape) == 4 and shape[1] < m + if exclude_cond_shape2: _logger.warning( '{} is not pruned because the first dimension of {} is smaller than {}'.format( param_name, shape, m ) ) return weight_pruned_nparray, weight_sparse_mask - if exlude_cond_shape4: + if exclude_cond_shape4: _logger.warning( '{} is not pruned because the second dimension of {} is smaller than {}'.format( param_name, shape, m @@ -58,12 +58,12 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name): # SPMMA in cuSparseLt: D = (AxB) + C, where matrix A (mxk) is sparse matrix. # cuSparseLt would prune matrix A along k dimension. # In sparse training, layer weight matrices is viewed sparse matrix A, so - # the math fomula should be 'Act(WX + b)'. However, default fomula in PaddlePaddle + # the math formula should be 'Act(WX + b)'. However, default formula in PaddlePaddle # is 'Act(XW + b)'. For enabling SPMMA, weights and inputs should be transposed # for computing, Act( (W^T X^T)^T + b). Therefore, we have to prune alog k dimension - # of W^T, which is m dimension of W. Moreove, all mask generating functions in + # of W^T, which is m dimension of W. Moreover, all mask generating functions in # asp/utils is row-major pruning. That is the reason we have to transpose weight - # matrices beforce invoking create_mask. Then we transpose the result mask to make + # matrices before invoking create_mask. Then we transpose the result mask to make # sure its shape to be the same as the input weight. 
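A hedged NumPy-only sketch of the transpose trick this comment describes, with a toy 2:4 row-major pruner standing in for `asp.create_mask` and shapes assumed divisible by 4:

~~~ python
import numpy as np

def prune_rows_2_of_4(mat):
    """Zero the two smallest-magnitude entries in each group of 4 per row."""
    groups = mat.reshape(-1, 4).copy()
    smallest = np.argsort(np.abs(groups), axis=1)[:, :2]
    np.put_along_axis(groups, smallest, 0.0, axis=1)
    return groups.reshape(mat.shape)

w = np.arange(1.0, 17.0).reshape(4, 4)  # toy weight W, laid out for Act(XW + b)
mask_t = prune_rows_2_of_4(w.T) != 0    # prune W^T row-major (along m of W) ...
mask = mask_t.T                         # ... then transpose back to W's shape
assert mask.shape == w.shape and mask.sum() == w.size // 2
~~~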
weight_sparse_mask = asp.create_mask( weight_nparray.T, func_name=func_name, n=n, m=m diff --git a/python/paddle/incubate/asp/utils.py b/python/paddle/incubate/asp/utils.py index 4ed8d7e74d56e..f8918a5ed0ced 100644 --- a/python/paddle/incubate/asp/utils.py +++ b/python/paddle/incubate/asp/utils.py @@ -171,11 +171,11 @@ def check_mask_1d(mat, n, m): True """ if len(mat.shape) <= 1: - mat_flattern, shape = _reshape_1d(mat.reshape(1, mat.shape[0]), m) + mat_flatten, shape = _reshape_1d(mat.reshape(1, mat.shape[0]), m) else: - mat_flattern, shape = _reshape_1d(mat, m) + mat_flatten, shape = _reshape_1d(mat, m) - for sub_mat in mat_flattern: + for sub_mat in mat_flatten: if np.nonzero(sub_mat)[0].size > (m - n): return False return True @@ -210,12 +210,12 @@ def get_mask_1d(mat, n, m): >>> print(y) True """ - mat_flattern, shape = _reshape_1d(mat, m) + mat_flatten, shape = _reshape_1d(mat, m) - mask_flattern = np.ones_like(mat_flattern) + mask_flattern = np.ones_like(mat_flatten) mask = np.ones_like(mat) - for i in range(mat_flattern.shape[0]): - sub_mat = mat_flattern[i] + for i in range(mat_flatten.shape[0]): + sub_mat = mat_flatten[i] min_order_indices = np.argsort(np.absolute(sub_mat)) mask_flattern[i, min_order_indices[:n].tolist()] = 0 mask_flattern = mask_flattern.reshape(shape) @@ -252,7 +252,7 @@ def _reshape_2d(mat, m): mat_padded = np.zeros(new_shape) mat_padded[: mat.shape[0], : mat.shape[1]] = mat - mat_flattern = np.empty(new_shape).reshape(-1, m * m) + mat_flatten = np.empty(new_shape).reshape(-1, m * m) curr_idx = 0 for row_start in range(0, mat_padded.shape[0], m): row_end = row_start + m @@ -261,9 +261,9 @@ def _reshape_2d(mat, m): sub_mat = np.squeeze( mat_padded[row_start:row_end, col_start:col_end].reshape(-1) ) - mat_flattern[curr_idx] = sub_mat + mat_flatten[curr_idx] = sub_mat curr_idx += 1 - return mat_flattern, mat_padded.shape + return mat_flatten, mat_padded.shape def check_mask_2d(mat, n, m): @@ -400,7 +400,7 @@ def get_mask_2d_greedy(mat, n, m): def _compute_valid_2d_patterns(n, m): r""" - Compute all vaild 2D `n:m` sparse patterns. + Compute all valid 2D `n:m` sparse patterns. 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block under the constraint of at least :attr:`n` zeros for each row and column. @@ -409,7 +409,7 @@ def _compute_valid_2d_patterns(n, m): n (int): n of `n:m` sparse pattern. m (int): m of `n:m` sparse pattern. Returns: - dictionary: A dictionary with key: *m_n* (string) and value: all vaild 2D `n:m` sparse patterns. + dictionary: A dictionary with key: *m_n* (string) and value: all valid 2D `n:m` sparse patterns. """ global _valid_2d_patterns_lock global _valid_2d_patterns @@ -442,7 +442,7 @@ def _compute_valid_2d_patterns(n, m): def get_mask_2d_best(mat, n, m): r""" Generate 2D `n:m` sparse pattern mask of the input matrix :attr:`mat` - to form sparse matrix with maximun L1 norm .This function would pad each + to form sparse matrix with maximum L1 norm .This function would pad each dimension of :attr:`mat` by zero to be a multiples of :attr:`m` before mask generation. 
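For reference, the 1-D `n:m` validity rule that `check_mask_1d` enforces, as a small self-contained sketch (2:4 by default; the reshape assumes the row length is a multiple of `m`):

~~~ python
import numpy as np

def check_mask_1d_sketch(mat, n=2, m=4):
    """True iff every length-m group holds at most m - n nonzeros."""
    groups = mat.reshape(-1, m)
    return bool(((groups != 0).sum(axis=1) <= m - n).all())

ok = np.array([[0.0, 1.0, 0.0, 2.0], [3.0, 0.0, 4.0, 0.0]])
bad = np.array([[1.0, 1.0, 1.0, 0.0], [3.0, 0.0, 4.0, 0.0]])
assert check_mask_1d_sketch(ok) and not check_mask_1d_sketch(bad)
~~~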
2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block @@ -475,10 +475,10 @@ def get_mask_2d_best(mat, n, m): """ patterns = _compute_valid_2d_patterns(n, m) - mat_flattern, shape = _reshape_2d(mat, m) - mask_flattern = np.ones_like(mat_flattern).reshape(-1, m, m) + mat_flatten, shape = _reshape_2d(mat, m) + mask_flattern = np.ones_like(mat_flatten).reshape(-1, m, m) pmax = np.argmax( - np.matmul(mat_flattern, patterns.reshape(patterns.shape[0], m * m).T), + np.matmul(mat_flatten, patterns.reshape(patterns.shape[0], m * m).T), axis=1, ) @@ -502,7 +502,7 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4): Args: tensor (nparray): The input tensor. - func_name (MaskAlgo, optional): The function name to generate spase mask. Default is `MaskAlgo.MASK_1D`. All options please refer to `MaskAlgo`. + func_name (MaskAlgo, optional): The function name to generate sparse mask. Default is `MaskAlgo.MASK_1D`. All options please refer to `MaskAlgo`. n (int, optional): n of `n:m` sparse pattern. Default is 2. m (int, optional): m of `n:m` sparse pattern. Default is 4. Returns: @@ -573,7 +573,7 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4): Args: tensor (nparray): The input tensor. - func_name (CheckMethod, optional): The function name to generate spase mask. Default is `CheckMethod.CHECK_1D`. All options please refer to `CheckMethod`. + func_name (CheckMethod, optional): The function name to generate sparse mask. Default is `CheckMethod.CHECK_1D`. All options please refer to `CheckMethod`. n (int, optional): n of `n:m` sparse pattern. Default is 2. m (int, optional): m of `n:m` sparse pattern. Default is 4. Returns: @@ -605,7 +605,7 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4): t = tensor.astype(float) assert type(func_name) == CheckMethod, ( - "func_name argumet of check_sparsity is only accepted as type CheckMethod. " + "func_name argument of check_sparsity is only accepted as type CheckMethod. " f"But got {type(func_name)}" ) func = getattr(sys.modules[__name__], func_name.value, None) diff --git a/python/paddle/incubate/autograd/primapi.py b/python/paddle/incubate/autograd/primapi.py index 9f62d1f5835c7..d0c7d41ef194d 100644 --- a/python/paddle/incubate/autograd/primapi.py +++ b/python/paddle/incubate/autograd/primapi.py @@ -74,13 +74,13 @@ def forward_grad(outputs, inputs, grad_inputs=None): if not isinstance(outputs, (framework.Variable, typing.Sequence)): raise TypeError( - f'Expected outputs is Tensor|Sequence[Tesnor], ' + f'Expected outputs is Tensor|Sequence[Tensor], ' f'but got {type(outputs)}.' ) if not isinstance(inputs, (framework.Variable, typing.Sequence)): raise TypeError( - f'Expected inputs is Tensor|Sequence[Tesnor], ' + f'Expected inputs is Tensor|Sequence[Tensor], ' f'but got {type(inputs)}.' ) @@ -165,13 +165,13 @@ def grad(outputs, inputs, grad_outputs=None): if not isinstance(outputs, (framework.Variable, typing.Sequence)): raise TypeError( - f'Expected outputs is Tensor|Sequence[Tesnor], ' + f'Expected outputs is Tensor|Sequence[Tensor], ' f'but got {type(outputs)}.' ) if not isinstance(inputs, (framework.Variable, typing.Sequence)): raise TypeError( - f'Expected inputs is Tensor|Sequence[Tesnor], ' + f'Expected inputs is Tensor|Sequence[Tensor], ' f'but got {type(inputs)}.' 
) diff --git a/python/paddle/incubate/autotune.py b/python/paddle/incubate/autotune.py index 745ac9fc69c07..c99b3498946c4 100644 --- a/python/paddle/incubate/autotune.py +++ b/python/paddle/incubate/autotune.py @@ -136,10 +136,10 @@ def set_config(config=None): ) if "dataloader" in config_dict: dataloader_config = config_dict["dataloader"] - use_autoune = False + use_autotune = False if "enable" in dataloader_config: if isinstance(dataloader_config['enable'], bool): - use_autoune = dataloader_config['enable'] + use_autotune = dataloader_config['enable'] else: warnings.warn( "The auto-tuning configuration of the dataloader is incorrect." @@ -148,11 +148,11 @@ def set_config(config=None): if "tuning_steps" in dataloader_config: if isinstance(dataloader_config['tuning_steps'], int): paddle.io.reader.set_autotune_config( - use_autoune, dataloader_config['tuning_steps'] + use_autotune, dataloader_config['tuning_steps'] ) else: warnings.warn( "The auto-tuning configuration of the dataloader is incorrect." "The `tuning_steps` should be int. Use default parameter instead." ) - paddle.io.reader.set_autotune_config(use_autoune) + paddle.io.reader.set_autotune_config(use_autotune) diff --git a/python/paddle/incubate/distributed/fleet/parameter_server/distribute_transpiler/__init__.py b/python/paddle/incubate/distributed/fleet/parameter_server/distribute_transpiler/__init__.py index f810014e93b3b..c6b6eec025107 100644 --- a/python/paddle/incubate/distributed/fleet/parameter_server/distribute_transpiler/__init__.py +++ b/python/paddle/incubate/distributed/fleet/parameter_server/distribute_transpiler/__init__.py @@ -257,14 +257,14 @@ def _init_transpiler_server(self, model_dir=None): sparse_varnames = self.compiled_config.get_sparse_varname_on_ps( True ) - distribtued_varnames = ( + distributed_varnames = ( self.compiled_config.get_sparse_varname_on_ps(False) ) remaining_vars = list( filter( FleetTranspiler.__exclude_vars( - sparse_varnames + distribtued_varnames + sparse_varnames + distributed_varnames ), self.main_program.list_vars(), ) @@ -282,7 +282,7 @@ def _init_transpiler_server(self, model_dir=None): ) # todo(tangwei12) load distributed vars - # self._load_sparse_params(dirname=model_dir, varnames=distribtued_varnames) + # self._load_sparse_params(dirname=model_dir, varnames=distributed_varnames) def init_server(self, model_dir=None, **kwargs): """ diff --git a/python/paddle/jit/dy2static/transformers/decorator_transformer.py b/python/paddle/jit/dy2static/transformers/decorator_transformer.py index 143d1fb1e14d7..c19ce1f95b587 100644 --- a/python/paddle/jit/dy2static/transformers/decorator_transformer.py +++ b/python/paddle/jit/dy2static/transformers/decorator_transformer.py @@ -56,13 +56,13 @@ def visit_FunctionDef(self, node): # every decorator will append a node decofun_nodes = [] - # func to be decoed next time + # func to be decoded next time deco_target = '_orig_' + node.name - # last decoed func - decoed_func = '' + # last decoded func + decoded_func = '' for deco in reversed(deco_list): - # skip INGNORE_NAMES + # skip IGNORE_NAMES deco_full_name = ast_to_source_code(deco).strip() if isinstance(deco, gast.Call): # match case like : @@ -90,7 +90,7 @@ def visit_FunctionDef(self, node): "Dy2Static : A context manager decorator is used, this may not work correctly after transform." 
) - decoed_func = '_decoedby_' + deco_name + decoded_func = '_decoedby_' + deco_name # get function after decoration if isinstance(deco, gast.Call): @@ -104,7 +104,7 @@ def visit_FunctionDef(self, node): re_args = rematch.group(2) re_args_with_func = deco_target + ', ' + re_args decofun_str = 'try:\n\t{0} = _jst.Call({1})({2})\nexcept:\n\t{0} = _jst.Call({1})({3})({4})'.format( - decoed_func, + decoded_func, re_name, re_args_with_func, re_args, @@ -117,7 +117,7 @@ def visit_FunctionDef(self, node): re_args = rematch.group(2) re_args_with_func = deco_target + ', ' + re_args decofun_str = 'try:\n\t{0} = {1}({2})\nexcept:\n\t{0} = {1}({3})({4})'.format( - decoed_func, + decoded_func, re_name, re_args_with_func, re_args, @@ -126,11 +126,11 @@ def visit_FunctionDef(self, node): else: decofun_str = '{} = _jst.Call({})({})'.format( - decoed_func, deco_full_name, deco_target + decoded_func, deco_full_name, deco_target ) decofun_nodes.extend(gast.parse(decofun_str).body) - deco_target = decoed_func + deco_target = decoded_func if not decofun_nodes: return node @@ -146,7 +146,7 @@ def visit_FunctionDef(self, node): args = [arg.id for arg in node.args.args] arg_str = ','.join(args) - callfun_str = f'return {decoed_func}({arg_str})' + callfun_str = f'return {decoded_func}({arg_str})' callfun_node = gast.parse(callfun_str).body[0] node.body = [orig_func_node] + decofun_nodes + [callfun_node] diff --git a/python/paddle/jit/dy2static/transformers/tensorhook_transformer.py b/python/paddle/jit/dy2static/transformers/tensorhook_transformer.py index b0a5c56063ab4..04abaa34ef38b 100644 --- a/python/paddle/jit/dy2static/transformers/tensorhook_transformer.py +++ b/python/paddle/jit/dy2static/transformers/tensorhook_transformer.py @@ -38,7 +38,7 @@ def transform(self): self.visit(self.root) def reorder_block_statements(self, stmts): - regisiter_hook_nodes = [ + register_hook_nodes = [ n for n in stmts for stmt in gast.walk(n) @@ -46,7 +46,7 @@ def reorder_block_statements(self, stmts): ] # Analyze the register_hook nodes name dependency dependents = {} - for n in regisiter_hook_nodes: + for n in register_hook_nodes: if n not in stmts: continue for load_node in get_loads(n): diff --git a/python/paddle/jit/dy2static/utils.py b/python/paddle/jit/dy2static/utils.py index 582dd370aa4b4..ce1c26afcb333 100644 --- a/python/paddle/jit/dy2static/utils.py +++ b/python/paddle/jit/dy2static/utils.py @@ -309,7 +309,7 @@ def func_prefix(func): global DEL_TEMP_DIR if delete_on_exit and DEL_TEMP_DIR: - # Clear temporary files in TEMP_DIR while exitting Python process + # Clear temporary files in TEMP_DIR while exiting Python process atexit.register(remove_if_exit, dir_path=temp_dir) DEL_TEMP_DIR = False @@ -576,16 +576,16 @@ def name_judge(): @signature_safe_contextmanager def backend_guard(backend): core.check_and_set_prim_all_enabled() - orign_fwd = core._is_fwd_prim_enabled() - orign_bwd = core._is_bwd_prim_enabled() + origin_fwd = core._is_fwd_prim_enabled() + origin_bwd = core._is_bwd_prim_enabled() if backend == 'CINN': core._set_prim_all_enabled(True) try: yield finally: - core._set_prim_forward_enabled(orign_fwd) - core._set_prim_backward_enabled(orign_bwd) + core._set_prim_forward_enabled(origin_fwd) + core._set_prim_backward_enabled(origin_bwd) def construct_grad_names(grad_info_map, x_vars, param_vars, out_vars): diff --git a/python/paddle/jit/sot/symbolic/export.py b/python/paddle/jit/sot/symbolic/export.py index 720ef70730d20..39b06eca1891c 100644 --- a/python/paddle/jit/sot/symbolic/export.py +++ 
b/python/paddle/jit/sot/symbolic/export.py @@ -31,8 +31,8 @@ def __init__(self, *lines): def get_lines(self, prefix=""): lines = [prefix + line for line in self.lines] - for statment in self.sub_statement: - lines.extend(statment.get_lines(self.tab + prefix)) + for statement in self.sub_statement: + lines.extend(statement.get_lines(self.tab + prefix)) return lines def add_sub(self, *lines): @@ -302,7 +302,7 @@ def create_tail(self): ) def init_sub_layer(self, layer, layer_name): - # TODO @wuzhanfei need more effecient way to create a sub layer + # TODO @wuzhanfei need more efficient way to create a sub layer # now, we just close call_Layer behavior raise ExportError("Not support create sub layer now.") @@ -385,4 +385,6 @@ def export(SIR, path): with open(os.path.join(path, f"{SIR.name}.py"), "w") as f: f.write(string) - print(f"[SOT] Export {SIR.name} Sucess with size {len(SIR.statements)}") + print( + f"[SOT] Export {SIR.name} Success with size {len(SIR.statements)}" + ) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index f057a261e9da7..a931912ae9572 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -1132,7 +1132,7 @@ def multiply_(x, y, name=None): return _C_ops.multiply_(x, y) -def _elementwise_op_with_axis(x, y, axis=-1, name=None, op_type="Undifined"): +def _elementwise_op_with_axis(x, y, axis=-1, name=None, op_type="Undefined"): assert ( in_dynamic_or_pir_mode() ), "You can only call `_elementwise_op_with_axis` function within in_dynamic_or_pir_mode" diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py index 35bda07cab67b..b48f9fcaa2c28 100644 --- a/python/paddle/utils/cpp_extension/cpp_extension.py +++ b/python/paddle/utils/cpp_extension/cpp_extension.py @@ -589,7 +589,7 @@ def win_custom_spawn(cmd): finally: self.compiler.spawn = original_spawn - def object_filenames_with_cuda(origina_func, build_directory): + def object_filenames_with_cuda(original_func, build_directory): """ Decorated the function to add customized naming mechanism. 
Originally, both .cc/.cu will have .o object output that will @@ -598,7 +598,7 @@ def object_filenames_with_cuda(origina_func, build_directory): def wrapper(source_filenames, strip_dir=0, output_dir=''): try: - objects = origina_func( + objects = original_func( source_filenames, strip_dir, output_dir ) for i, source in enumerate(source_filenames): @@ -618,7 +618,7 @@ def wrapper(source_filenames, strip_dir=0, output_dir=''): # ensure to use abspath objects = [os.path.abspath(obj) for obj in objects] finally: - self.compiler.object_filenames = origina_func + self.compiler.object_filenames = original_func return objects From bb2943881ca9927ad9b08f1f460f90707ec901fc Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 29 Feb 2024 23:39:58 +0800 Subject: [PATCH 29/55] Fix distribuions distributions, etc (#62161) --- test/distribution/test_distribution_categorical.py | 2 +- test/xpu/test_adamw_fp16_xpu.py | 2 +- test/xpu/test_argsort_op_xpu.py | 4 ++-- test/xpu/test_collective_allgather_xpu.py | 4 ++-- test/xpu/test_collective_allreduce_xpu.py | 4 ++-- test/xpu/test_collective_broadcast_xpu.py | 4 ++-- test/xpu/test_collective_process_group_xpu.py | 2 +- test/xpu/test_collective_reduce_xpu.py | 4 ++-- test/xpu/test_device_guard_xpu.py | 4 ++-- test/xpu/test_scatter_nd_add_op_xpu.py | 6 +++--- 10 files changed, 18 insertions(+), 18 deletions(-) diff --git a/test/distribution/test_distribution_categorical.py b/test/distribution/test_distribution_categorical.py index d87c72e73438c..8be8b31672a9d 100644 --- a/test/distribution/test_distribution_categorical.py +++ b/test/distribution/test_distribution_categorical.py @@ -313,7 +313,7 @@ def get_numpy_selected_probs(self, probability): class CategoricalTest7(CategoricalTest): def init_numpy_data(self, batch_size, dims): # input logtis is 3-D Tensor - # value used in probs and log_prob method has the same number of distribuions with input + # value used in probs and log_prob method has the same number of distributions with input self.logits_np = np.random.rand(3, 2, 5).astype('float32') self.other_logits_np = np.random.rand(3, 2, 5).astype('float32') self.value_np = np.array([2, 1, 3]).astype('int64') diff --git a/test/xpu/test_adamw_fp16_xpu.py b/test/xpu/test_adamw_fp16_xpu.py index ca7c799312410..e9a6b1540fa49 100644 --- a/test/xpu/test_adamw_fp16_xpu.py +++ b/test/xpu/test_adamw_fp16_xpu.py @@ -59,7 +59,7 @@ def test_state_dict(self): state_dict_1["linear_0.b_0_moment1_0.SCALE_VALUE"] = 12.3125 adam.set_state_dict(state_dict_1) - # check overwrited value + # check overwritten value state_dict_2 = adam.state_dict() self.assertTrue("linear_0.w_0_moment1_0.SCALE_VALUE" in state_dict_2) self.assertTrue("linear_0.b_0_moment1_0.SCALE_VALUE" in state_dict_2) diff --git a/test/xpu/test_argsort_op_xpu.py b/test/xpu/test_argsort_op_xpu.py index f3a8a69ee5ded..c8ddebf859ecd 100644 --- a/test/xpu/test_argsort_op_xpu.py +++ b/test/xpu/test_argsort_op_xpu.py @@ -165,7 +165,7 @@ def init_test_case(self): 2, 8732, 1, - ] # test for 8192 < n <= 10240 + nees_transpose + ] # test for 8192 < n <= 10240 + need_transpose self.axis = 1 class TestArgsortOpCase4(TestArgsortOpCase1): @@ -174,7 +174,7 @@ def init_test_case(self): 2, 10241, 1, - ] # test for 10240 < n <= 16384 + nees_transpose + ] # test for 10240 < n <= 16384 + need_transpose self.axis = 1 diff --git a/test/xpu/test_collective_allgather_xpu.py b/test/xpu/test_collective_allgather_xpu.py index ad232cba70a88..55f516337baff 100644 --- a/test/xpu/test_collective_allgather_xpu.py +++ 
b/test/xpu/test_collective_allgather_xpu.py @@ -29,7 +29,7 @@ def _setup_config(self): @unittest.skipIf( not core.is_compiled_with_xpu() or paddle.device.xpu.device_count() < 2, - "run test when having at leaset 2 XPUs.", + "run test when having at least 2 XPUs.", ) def test_allgather(self): support_types = get_xpu_op_support_types('c_allgather') @@ -40,7 +40,7 @@ def test_allgather(self): @unittest.skipIf( not core.is_compiled_with_xpu() or paddle.device.xpu.device_count() < 2, - "run test when having at leaset 2 XPUs.", + "run test when having at least 2 XPUs.", ) def test_allgather_dygraph(self): support_types = get_xpu_op_support_types('c_allgather') diff --git a/test/xpu/test_collective_allreduce_xpu.py b/test/xpu/test_collective_allreduce_xpu.py index 4d8797cc0972f..c52ca781f35af 100644 --- a/test/xpu/test_collective_allreduce_xpu.py +++ b/test/xpu/test_collective_allreduce_xpu.py @@ -29,7 +29,7 @@ def _setup_config(self): @unittest.skipIf( not core.is_compiled_with_xpu() or paddle.device.xpu.device_count() < 2, - "run test when having at leaset 2 XPUs.", + "run test when having at least 2 XPUs.", ) def test_allreduce(self): support_types = get_xpu_op_support_types('c_allreduce_sum') @@ -42,7 +42,7 @@ def test_allreduce(self): @unittest.skipIf( not core.is_compiled_with_xpu() or paddle.device.xpu.device_count() < 2, - "run test when having at leaset 2 XPUs.", + "run test when having at least 2 XPUs.", ) def test_allreduce_dygraph(self): support_types = get_xpu_op_support_types('c_allreduce_sum') diff --git a/test/xpu/test_collective_broadcast_xpu.py b/test/xpu/test_collective_broadcast_xpu.py index 7fa695b321781..91e3024ee3838 100644 --- a/test/xpu/test_collective_broadcast_xpu.py +++ b/test/xpu/test_collective_broadcast_xpu.py @@ -29,7 +29,7 @@ def _setup_config(self): @unittest.skipIf( not core.is_compiled_with_xpu() or paddle.device.xpu.device_count() < 2, - "run test when having at leaset 2 XPUs.", + "run test when having at least 2 XPUs.", ) def test_broadcast(self): support_types = get_xpu_op_support_types('c_broadcast') @@ -42,7 +42,7 @@ def test_broadcast(self): @unittest.skipIf( not core.is_compiled_with_xpu() or paddle.device.xpu.device_count() < 2, - "run test when having at leaset 2 XPUs.", + "run test when having at least 2 XPUs.", ) def test_broadcast_dygraph(self): support_types = get_xpu_op_support_types('c_broadcast') diff --git a/test/xpu/test_collective_process_group_xpu.py b/test/xpu/test_collective_process_group_xpu.py index ec351b857ab93..166b1e6707596 100644 --- a/test/xpu/test_collective_process_group_xpu.py +++ b/test/xpu/test_collective_process_group_xpu.py @@ -23,7 +23,7 @@ class TestProcessGroup(TestMultipleXpus): @unittest.skipIf( not core.is_compiled_with_xpu() or paddle.device.xpu.device_count() < 2, - "run test when having at leaset 2 XPUs.", + "run test when having at least 2 XPUs.", ) def test_process_group_bkcl(self): self.run_mnist_2xpu('process_group_bkcl.py') diff --git a/test/xpu/test_collective_reduce_xpu.py b/test/xpu/test_collective_reduce_xpu.py index be5eccdc9a0e8..b36e3e3be5203 100644 --- a/test/xpu/test_collective_reduce_xpu.py +++ b/test/xpu/test_collective_reduce_xpu.py @@ -29,7 +29,7 @@ def _setup_config(self): @unittest.skipIf( not core.is_compiled_with_xpu() or paddle.device.xpu.device_count() < 2, - "run test when having at leaset 2 XPUs.", + "run test when having at least 2 XPUs.", ) def test_reduce(self): support_types = get_xpu_op_support_types('c_reduce_sum') @@ -42,7 +42,7 @@ def test_reduce(self): @unittest.skipIf( not 
core.is_compiled_with_xpu() or paddle.device.xpu.device_count() < 2, - "run test when having at leaset 2 XPUs.", + "run test when having at least 2 XPUs.", ) def test_reduce_dygraph(self): support_types = get_xpu_op_support_types('c_reduce_sum') diff --git a/test/xpu/test_device_guard_xpu.py b/test/xpu/test_device_guard_xpu.py index ce85946aee74e..bcc9e85839bee 100644 --- a/test/xpu/test_device_guard_xpu.py +++ b/test/xpu/test_device_guard_xpu.py @@ -31,7 +31,7 @@ def execute(main_program, startup_program): exe.run(main_program) -def get_vaild_warning_num(warning, w): +def get_valid_warning_num(warning, w): num = 0 for i in range(len(w)): if warning in str(w[i].message): @@ -160,7 +160,7 @@ def test_without_kernel_op(self): paddle.assign(paddle.less_than(x=i, y=loop_len), cond) warning = "The Op(while) is not support to set device." - warning_num = get_vaild_warning_num(warning, w) + warning_num = get_valid_warning_num(warning, w) assert warning_num == 1 all_ops = main_program.global_block().ops diff --git a/test/xpu/test_scatter_nd_add_op_xpu.py b/test/xpu/test_scatter_nd_add_op_xpu.py index 6efb4fec3b0f7..d8733dd1a1e83 100644 --- a/test/xpu/test_scatter_nd_add_op_xpu.py +++ b/test/xpu/test_scatter_nd_add_op_xpu.py @@ -34,11 +34,11 @@ def numpy_scatter_nd(ref, index, updates, fun): end_size = index_shape[-1] # as type int32, flat_index or flat_updates can't reshape to int64 - remain_numl = np.prod(index_shape[:-1]).astype("int32") + remain_numel = np.prod(index_shape[:-1]).astype("int32") slice_size = np.prod(ref_shape[end_size : len(ref_shape)]).astype("int32") - flat_index = index.reshape([remain_numl] + list(index_shape[-1:])) - flat_updates = updates.reshape((remain_numl, slice_size)) + flat_index = index.reshape([remain_numel] + list(index_shape[-1:])) + flat_updates = updates.reshape((remain_numel, slice_size)) flat_output = ref.reshape(list(ref_shape[:end_size]) + [slice_size]) for i_up, i_out in enumerate(flat_index): From 16dfd859811df562480584a9b17cb589ccadcce2 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 29 Feb 2024 23:40:29 +0800 Subject: [PATCH 30/55] Fix precsion precision, etc (#62160) --- paddle/fluid/pir/drr/README.md | 4 +-- paddle/fluid/pir/drr/README_cn.md | 4 +-- .../transforms/auto_mixed_precision_pass.cc | 2 +- .../pir/transforms/identity_op_clean_pass.cc | 26 +++++++++---------- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/pir/drr/README.md b/paddle/fluid/pir/drr/README.md index 1c5de89780c6f..d9b435160c41d 100644 --- a/paddle/fluid/pir/drr/README.md +++ b/paddle/fluid/pir/drr/README.md @@ -9,9 +9,9 @@ DRR can reduce the development cost of PASS, allowing developers to focus on pro Taking PASS to eliminate redundant CastOp as an example, the code example developed using DRR is as follows: ~~~ c++ // 1. Inherit class from DrPatternBase -class RemoveRedundentCastPattern : public paddle::drr::DrrPatternBase { +class RemoveRedundantCastPattern : public paddle::drr::DrrPatternBase { public: - std::string name() const override { return "RemoveRedundentCastPattern"; } + std::string name() const override { return "RemoveRedundantCastPattern"; } // 2. 
Overload operator() void operator()(paddle::drr::DrrPatternContext *ctx) const override { diff --git a/paddle/fluid/pir/drr/README_cn.md b/paddle/fluid/pir/drr/README_cn.md index e621e7112ac30..c01b21febeda3 100644 --- a/paddle/fluid/pir/drr/README_cn.md +++ b/paddle/fluid/pir/drr/README_cn.md @@ -9,9 +9,9 @@ DRR ( Declarative Rewrite Rule ) 是来处理这种 DAG-to-DAG 类型的一套 P 以消除冗余 CastOp 的 PASS 为例,使用 DRR 的代码开发示例如下: ~~~ c++ // 1. 继承 DrrPatternBase 类 -class RemoveRedundentCastPattern : public paddle::drr::DrrPatternBase { +class RemoveRedundantCastPattern : public paddle::drr::DrrPatternBase { public: - std::string name() const override { return "RemoveRedundentCastPattern"; } + std::string name() const override { return "RemoveRedundantCastPattern"; } // 2. 重载 operator() void operator()(paddle::drr::DrrPatternContext *ctx) const override { diff --git a/paddle/fluid/pir/transforms/auto_mixed_precision_pass.cc b/paddle/fluid/pir/transforms/auto_mixed_precision_pass.cc index dee9aad09ed1d..1ff6b34565ed0 100644 --- a/paddle/fluid/pir/transforms/auto_mixed_precision_pass.cc +++ b/paddle/fluid/pir/transforms/auto_mixed_precision_pass.cc @@ -230,7 +230,7 @@ class AutoMixedPrecisionPass : public pir::Pass { if (!op->operand_source(idx)) continue; auto operand = op->operand(idx); if (operand.type() && operand.type().isa()) { - // check if there are all float in the vectortype + // check if there are all float in the vector type auto vec_type = operand.type().dyn_cast(); if (IsVectorTypeFloat(vec_type)) { auto input_operation = GetDefiningOpForInput(op, idx); diff --git a/paddle/fluid/pir/transforms/identity_op_clean_pass.cc b/paddle/fluid/pir/transforms/identity_op_clean_pass.cc index cf27800512b0b..32346997cd6c9 100644 --- a/paddle/fluid/pir/transforms/identity_op_clean_pass.cc +++ b/paddle/fluid/pir/transforms/identity_op_clean_pass.cc @@ -53,9 +53,9 @@ class RemoveUselessScalePattern : public paddle::drr::DrrPatternBase { } }; -class RemoveRedundentScalePattern : public paddle::drr::DrrPatternBase { +class RemoveRedundantScalePattern : public paddle::drr::DrrPatternBase { public: - std::string name() const override { return "RemoveRedundentScalePattern"; } + std::string name() const override { return "RemoveRedundantScalePattern"; } void operator()(paddle::drr::DrrPatternContext *ctx) const override { paddle::drr::SourcePattern pat = ctx->SourcePattern(); @@ -83,7 +83,7 @@ class RemoveRedundentScalePattern : public paddle::drr::DrrPatternBase { paddle::drr::ResultPattern res = pat.ResultPattern(); - const auto &bais_attr = res.ComputeAttr( + const auto &bias_attr = res.ComputeAttr( [](const paddle::drr::MatchContext &match_ctx) -> float { float res_bias_1 = 0.f; float res_bias_2 = 0.f; @@ -115,7 +115,7 @@ class RemoveRedundentScalePattern : public paddle::drr::DrrPatternBase { {"place", pat.Attr("place_1")}}); const auto &scale_op_res = res.Op("pd_op.scale", - {{"bias", bais_attr}, {"bias_after_scale", res.BoolAttr(true)}}); + {{"bias", bias_attr}, {"bias_after_scale", res.BoolAttr(true)}}); scale_op_res({&res.Tensor("x"), &full_op_res()}, {&res.Tensor("scale_2_out")}); } @@ -154,9 +154,9 @@ class RemoveUselessConcatPattern : public paddle::drr::DrrPatternBase { } }; -class RemoveRedundentCastPattern : public paddle::drr::DrrPatternBase { +class RemoveRedundantCastPattern : public paddle::drr::DrrPatternBase { public: - std::string name() const override { return "RemoveRedundentCastPattern"; } + std::string name() const override { return "RemoveRedundantCastPattern"; } void 
operator()(paddle::drr::DrrPatternContext *ctx) const override { auto pat = ctx->SourcePattern(); @@ -245,10 +245,10 @@ class ReplaceDropoutWithScalePattern : public paddle::drr::DrrPatternBase { } }; -class RemoveRedundentTransposePattern : public paddle::drr::DrrPatternBase { +class RemoveRedundantTransposePattern : public paddle::drr::DrrPatternBase { public: std::string name() const override { - return "RemoveRedundentTransposePattern"; + return "RemoveRedundantTransposePattern"; } void operator()(paddle::drr::DrrPatternContext *ctx) const override { @@ -271,10 +271,10 @@ class RemoveRedundentTransposePattern : public paddle::drr::DrrPatternBase { } return new_perm; }); - const auto &tranpose_continuous = + const auto &transpose_continuous = res.Op("pd_op.transpose", {{"perm", new_perm_attr}}); - res.Tensor("ret") = tranpose_continuous(res.Tensor("arg_transpose")); + res.Tensor("ret") = transpose_continuous(res.Tensor("arg_transpose")); } }; @@ -286,13 +286,13 @@ class IdentityOpCleanPass : public pir::PatternRewritePass { pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override { pir::RewritePatternSet ps(context); ps.Add(paddle::drr::Create(context)); - ps.Add(paddle::drr::Create(context)); + ps.Add(paddle::drr::Create(context)); ps.Add(paddle::drr::Create(context)); ps.Add(paddle::drr::Create(context)); - ps.Add(paddle::drr::Create(context)); + ps.Add(paddle::drr::Create(context)); ps.Add(paddle::drr::Create(context)); ps.Add(paddle::drr::Create(context)); - ps.Add(paddle::drr::Create(context)); + ps.Add(paddle::drr::Create(context)); return ps; } }; From c422cc561a6bc26151152e82ba387096ab453b01 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 29 Feb 2024 23:41:43 +0800 Subject: [PATCH 31/55] Fix quantdequant quant_dequant (#62046) * Fix * ci * ci * ci * ci --- .../ir/delete_quant_dequant_filter_op_pass.cc | 4 ++-- .../ir/delete_quant_dequant_linear_op_pass.cc | 2 +- .../fluid/framework/ir/delete_quant_dequant_op_pass.cc | 8 ++++---- paddle/fluid/framework/ir/graph_pattern_detector.cc | 10 +++++----- paddle/fluid/framework/ir/graph_pattern_detector.h | 6 +++--- .../ir/trt_delete_weight_dequant_linear_op_pass.cc | 2 +- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc index cfe644a61ea51..3bd051c597179 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc @@ -73,7 +73,7 @@ DeleteQuantDequantFilterOpPass::DeleteQuantDequantFilterOpPass() { } // Delete quant_dequant_op, then quantize and dequantize weight void DeleteQuantDequantFilterOpPass::ApplyImpl(ir::Graph* graph) const { - const std::string pattern_name = "delete_quantdequant_filter_op_pattern"; + const std::string pattern_name = "delete_quant_dequant_filter_op_pattern"; FusePassBase::Init(pattern_name, graph); GraphPatternDetector gpd; @@ -141,7 +141,7 @@ void DeleteQuantDequantFilterOpPass::ApplyImpl(ir::Graph* graph) const { "the received is %d", quant_axis)); - // To Do @Wangzheee: use "OutScale" to quantdequant + // To Do @Wangzheee: use "OutScale" to quant_dequant /*auto scales_name = quant_dequant_op->Op()->Output("OutScale"); PADDLE_ENFORCE_EQ(scales_name.size(), 1, platform::errors::InvalidArgument( diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc index 
7358a82c6ca3c..9d4006e6f3943 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc @@ -86,7 +86,7 @@ DeleteQuantDequantLinearOpPass::DeleteQuantDequantLinearOpPass() { } // Delete quantize_linear_op dequantize_linear_op, then add input_scales void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { - const std::string pattern_name = "delete_quantdequant_linear_op_pattern"; + const std::string pattern_name = "delete_quant_dequant_linear_op_pattern"; FusePassBase::Init(pattern_name, graph); GraphPatternDetector gpd; diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc index ebb0ed9d00dc1..2a7071d54843d 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc @@ -32,21 +32,21 @@ namespace ir { GET_IR_NODE(quant_dequant_op_out); void DeleteQuantDequantOpPass::ApplyImpl(ir::Graph* graph) const { - const std::string pattern_name = "delete_quantdequant_op_pattern"; + const std::string pattern_name = "delete_quant_dequant_op_pattern"; FusePassBase::Init(pattern_name, graph); GraphPatternDetector gpd; - std::string quantdequant_types = + std::string quant_dequant_types = "fake_quantize_dequantize_moving_average_abs_max"; auto* input_node = gpd.mutable_pattern() ->NewNode("input_node") - ->assert_is_op_input(quantdequant_types, "X") + ->assert_is_op_input(quant_dequant_types, "X") ->AsInput(); patterns::DeleteQuantDequantOpPattern pattern(gpd.mutable_pattern(), pattern_name); - pattern(input_node, quantdequant_types); + pattern(input_node, quant_dequant_types); auto* scope = param_scope(); int found_count = 0; diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index df804cf0d4f7b..034780ac0d0b8 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -3519,22 +3519,22 @@ void patterns::ShuffleChannelPattern::operator()(PDNode *reshape1_in) { } void patterns::DeleteQuantDequantOpPattern::operator()( - PDNode *input_node, const std::string &quantdequant_types) { + PDNode *input_node, const std::string &quant_dequant_types) { auto quant_dequant_op_inscale = pattern->NewNode(quant_dequant_op_inscale_repr()) - ->assert_is_op_input(quantdequant_types, "InScale") + ->assert_is_op_input(quant_dequant_types, "InScale") ->AsInput(); auto quant_dequant_op = pattern->NewNode(quant_dequant_op_repr()) - ->assert_is_op(quantdequant_types); + ->assert_is_op(quant_dequant_types); auto quant_dequant_op_out = pattern->NewNode(quant_dequant_op_out_repr()) - ->assert_is_op_output(quantdequant_types, "Out") + ->assert_is_op_output(quant_dequant_types, "Out") ->AsOutput(); auto quant_dequant_op_outscale = pattern->NewNode(quant_dequant_op_outscale_repr()) - ->assert_is_op_output(quantdequant_types, "OutScale") + ->assert_is_op_output(quant_dequant_types, "OutScale") ->AsOutput(); quant_dequant_op->LinksFrom({quant_dequant_op_inscale, input_node}); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 22d88e96b2852..4eac3440a4514 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1869,9 +1869,9 @@ struct DeleteDropoutOpPattern : public PatternBase { struct DeleteQuantDequantOpPattern : 
public PatternBase { DeleteQuantDequantOpPattern(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "delete_quantdequant_op_pattern") {} + : PatternBase(pattern, name_scope, "delete_quant_dequant_op_pattern") {} - void operator()(PDNode* input_node, const std::string& quantdequant_types); + void operator()(PDNode* input_node, const std::string& quant_dequant_types); PATTERN_DECL_NODE(quant_dequant_op_inscale); PATTERN_DECL_NODE(quant_dequant_op); @@ -1883,7 +1883,7 @@ struct DeleteQuantDequantFilterOpPattern : public PatternBase { DeleteQuantDequantFilterOpPattern(PDPattern* pattern, const std::string& name_scope) : PatternBase( - pattern, name_scope, "delete_quantdequant_filter_op_pattern") {} + pattern, name_scope, "delete_quant_dequant_filter_op_pattern") {} void operator()(); diff --git a/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc index 6e12933f0f4d5..b780c07fda0a6 100644 --- a/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc @@ -201,7 +201,7 @@ TrtDeleteWeightQuantDequantLinearOpPass:: void TrtDeleteWeightQuantDequantLinearOpPass::ApplyImpl( ir::Graph* graph) const { const std::string pattern_name = - "delete_weight_quantdequant_linear_op_pattern"; + "delete_weight_quant_dequant_linear_op_pattern"; FusePassBase::Init(pattern_name, graph); GraphPatternDetector gpd; From 2fb56196c4aaf7af47b512f92f560a3df7de0f07 Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Thu, 29 Feb 2024 23:48:10 +0800 Subject: [PATCH 32/55] [Typo error] fix typo error tesnor to tensor (#62175) --- paddle/fluid/framework/tensor_util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 96f3d71c132af..02aa4b500ce7b 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -103,7 +103,7 @@ void TensorToVector(const phi::DenseTensor& src, const platform::DeviceContext& ctx, std::vector* dst); template -void TesnorToVector(const phi::DenseTensor& src, std::vector* dst); +void TensorToVector(const phi::DenseTensor& src, std::vector* dst); // convert dlpack's DLTensor to tensor From 180c596fb4978047e738767fd14727008dab3fd7 Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Thu, 29 Feb 2024 23:49:13 +0800 Subject: [PATCH 33/55] =?UTF-8?q?[clang-tidy]=20fix=20about=2031=E3=80=813?= =?UTF-8?q?2=E3=80=8134=E3=80=8141=E3=80=8145=20(#62129)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paddle/fluid/framework/io/crypto/aes_cipher.cc | 8 ++++---- .../fluid/memory/allocation/naive_best_fit_allocator.cc | 2 +- paddle/fluid/platform/enforce_test.cc | 2 +- paddle/phi/core/dense_tensor.cc | 2 +- paddle/phi/core/sparse_coo_tensor.cc | 2 +- paddle/phi/core/sparse_csr_tensor.cc | 2 +- paddle/phi/core/string_tensor.cc | 2 +- paddle/phi/core/utils/intrusive_ref_counter.h | 2 +- paddle/phi/infermeta/spmd_rules/unsqueeze.cc | 2 +- paddle/pir/src/core/builtin_type_interfaces.cc | 4 ++-- 10 files changed, 14 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/framework/io/crypto/aes_cipher.cc b/paddle/fluid/framework/io/crypto/aes_cipher.cc index 8802dc1b12158..158d25a6957f7 100644 --- a/paddle/fluid/framework/io/crypto/aes_cipher.cc +++ b/paddle/fluid/framework/io/crypto/aes_cipher.cc @@ -65,7 +65,7 @@ std::string 
AESCipher::EncryptInternal(const std::string& plaintext, std::string ciphertext; m_filter->Attach(new CryptoPP::StringSink(ciphertext)); CryptoPP::Redirector* filter_redirector = new CryptoPP::Redirector(*m_filter); - CryptoPP::StringSource(plaintext, true, filter_redirector); + CryptoPP::StringSource ss(plaintext, true, filter_redirector); if (need_iv) { return iv_ + ciphertext; } @@ -96,7 +96,7 @@ std::string AESCipher::DecryptInternal(const std::string& ciphertext, std::string plaintext; m_filter->Attach(new CryptoPP::StringSink(plaintext)); CryptoPP::Redirector* filter_redirector = new CryptoPP::Redirector(*m_filter); - CryptoPP::StringSource( + CryptoPP::StringSource ss( ciphertext.substr(ciphertext_beg), true, filter_redirector); return plaintext; @@ -124,7 +124,7 @@ std::string AESCipher::AuthenticatedEncryptInternal( std::string ciphertext; m_filter->Attach(new CryptoPP::StringSink(ciphertext)); CryptoPP::Redirector* filter_redirector = new CryptoPP::Redirector(*m_filter); - CryptoPP::StringSource(plaintext, true, filter_redirector); + CryptoPP::StringSource ss(plaintext, true, filter_redirector); if (need_iv) { ciphertext = iv_.append(ciphertext); } @@ -155,7 +155,7 @@ std::string AESCipher::AuthenticatedDecryptInternal( std::string plaintext; m_filter->Attach(new CryptoPP::StringSink(plaintext)); CryptoPP::Redirector* filter_redirector = new CryptoPP::Redirector(*m_filter); - CryptoPP::StringSource( + CryptoPP::StringSource ss( ciphertext.substr(ciphertext_beg), true, filter_redirector); PADDLE_ENFORCE_EQ( m_filter->GetLastResult(), diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 612ba0798d2c0..45cf3b44baa8a 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -298,7 +298,7 @@ void *Alloc(const platform::CUDAPlace &place, auto *buddy_allocator = GetGPUBuddyAllocator(place.device); auto *ptr = buddy_allocator->Alloc(size); if (ptr == nullptr) { - platform::CUDADeviceGuard(place.device); + platform::CUDADeviceGuard guard(place.device); size_t avail, total; platform::GpuMemoryUsage(&avail, &total); PADDLE_THROW(platform::errors::ResourceExhausted( diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index 9bad3f0bf1c41..e6838746fd6ac 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -594,7 +594,7 @@ TEST(enforce, cannot_to_string_type) { } TEST(GET_DATA_SAFELY_MACRO, SUCCESS) { - int* a = new int(10); + int* a = new int(10); // NOLINT GET_DATA_SAFELY(a, "Input", "X", "dummy"); } diff --git a/paddle/phi/core/dense_tensor.cc b/paddle/phi/core/dense_tensor.cc index d15cc4eeafda1..8340c4d69c380 100644 --- a/paddle/phi/core/dense_tensor.cc +++ b/paddle/phi/core/dense_tensor.cc @@ -53,7 +53,7 @@ DenseTensor::DenseTensor(const std::shared_ptr& holder, const DenseTensorMeta& meta) : meta_(meta), holder_(holder) {} -DenseTensor::DenseTensor(const DenseTensor& other) { +DenseTensor::DenseTensor(const DenseTensor& other) { // NOLINT this->meta_ = other.meta(); holder_ = other.holder_; storage_properties_ = diff --git a/paddle/phi/core/sparse_coo_tensor.cc b/paddle/phi/core/sparse_coo_tensor.cc index dfd519250aa37..d6f41168981aa 100644 --- a/paddle/phi/core/sparse_coo_tensor.cc +++ b/paddle/phi/core/sparse_coo_tensor.cc @@ -51,7 +51,7 @@ SparseCooTensor::SparseCooTensor(DenseTensor&& non_zero_indices, meta_.dtype = 
non_zero_elements.dtype(); } -SparseCooTensor::SparseCooTensor(const SparseCooTensor& other) { +SparseCooTensor::SparseCooTensor(const SparseCooTensor& other) { // NOLINT this->non_zero_indices_ = other.non_zero_indices_; this->non_zero_elements_ = other.non_zero_elements_; this->coalesced_ = other.coalesced_; diff --git a/paddle/phi/core/sparse_csr_tensor.cc b/paddle/phi/core/sparse_csr_tensor.cc index 525f38cd8263d..f4373f528d217 100644 --- a/paddle/phi/core/sparse_csr_tensor.cc +++ b/paddle/phi/core/sparse_csr_tensor.cc @@ -66,7 +66,7 @@ SparseCsrTensor::SparseCsrTensor(const DenseTensor& non_zero_crows, meta_.dtype = non_zero_elements.dtype(); } -SparseCsrTensor::SparseCsrTensor(const SparseCsrTensor& other) { +SparseCsrTensor::SparseCsrTensor(const SparseCsrTensor& other) { // NOLINT this->non_zero_crows_ = other.non_zero_crows_; this->non_zero_cols_ = other.non_zero_cols_; this->non_zero_elements_ = other.non_zero_elements_; diff --git a/paddle/phi/core/string_tensor.cc b/paddle/phi/core/string_tensor.cc index d370be21f4cac..bb7d06825fdbb 100644 --- a/paddle/phi/core/string_tensor.cc +++ b/paddle/phi/core/string_tensor.cc @@ -37,7 +37,7 @@ StringTensor::StringTensor(const std::shared_ptr& holder, const StringTensorMeta& meta) : meta_(meta), holder_(holder) {} -StringTensor::StringTensor(const StringTensor& other) { +StringTensor::StringTensor(const StringTensor& other) { // NOLINT this->meta_ = other.meta(); holder_ = other.holder_; } diff --git a/paddle/phi/core/utils/intrusive_ref_counter.h b/paddle/phi/core/utils/intrusive_ref_counter.h index 1681f88af054f..6b2a3e989a840 100644 --- a/paddle/phi/core/utils/intrusive_ref_counter.h +++ b/paddle/phi/core/utils/intrusive_ref_counter.h @@ -57,7 +57,7 @@ inline void intrusive_ptr_release( const intrusive_ref_counter* p) noexcept { if (p->ref_.load(std::memory_order_acquire) == 0 || p->ref_.fetch_sub(1) == 0) { - delete static_cast(p); + delete static_cast(p); // NOLINT } } diff --git a/paddle/phi/infermeta/spmd_rules/unsqueeze.cc b/paddle/phi/infermeta/spmd_rules/unsqueeze.cc index cbb010fe6c6bf..ef47b31341a73 100644 --- a/paddle/phi/infermeta/spmd_rules/unsqueeze.cc +++ b/paddle/phi/infermeta/spmd_rules/unsqueeze.cc @@ -74,7 +74,7 @@ std::vector> MakeUnsqueezeDimTransReverse( ret.resize(x_ndim); fill(ret.begin(), ret.end(), std::make_shared()); - for (int64_t i = 0, j = 0; i < out_ndim; i++) { + for (int64_t i = 0, j = 0; i < out_ndim; i++) { // NOLINT auto it = find(axis.begin(), axis.end(), i); if (it == axis.end()) { diff --git a/paddle/pir/src/core/builtin_type_interfaces.cc b/paddle/pir/src/core/builtin_type_interfaces.cc index de0538eacc0d9..5b8d14b74175a 100644 --- a/paddle/pir/src/core/builtin_type_interfaces.cc +++ b/paddle/pir/src/core/builtin_type_interfaces.cc @@ -18,11 +18,11 @@ namespace pir { Type ShapedTypeInterface::GetElementType() const { - return impl_->get_element_type(*this); + return impl_->get_element_type(*this); // NOLINT } pir::DDim ShapedTypeInterface::GetShape() const { - return impl_->get_shape(*this); + return impl_->get_shape(*this); // NOLINT } } // namespace pir From 23adc6a42e7f1ee0d38df689b1a12449a156c3b0 Mon Sep 17 00:00:00 2001 From: lanxianghit <47554610+lanxianghit@users.noreply.github.com> Date: Fri, 1 Mar 2024 09:46:44 +0800 Subject: [PATCH 34/55] [PIR][DynamicShape] Add shape pass to inference predictor (#62167) * [PIR][DynamicShape] Add shape pass to inference predictor * move decomp case * fix ci --- .../fluid/inference/api/analysis_predictor.cc | 10 ++++- 
.../pir/transforms/shape_optimization_pass.cc | 38 +++++++++++++++++++ .../pir/transforms/shape_optimization_pass.h | 10 +++++ paddle/fluid/pybind/pir.cc | 21 +--------- test/ir/pir/cinn/symbolic/CMakeLists.txt | 14 +++++++ .../test_decomp_inference_predictor_run.py | 7 ++-- 6 files changed, 77 insertions(+), 23 deletions(-) rename test/ir/{inference => pir/cinn/symbolic}/test_decomp_inference_predictor_run.py (96%) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index d52f71573dc44..35ff7eb608b6a 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -131,6 +131,7 @@ #include "paddle/fluid/pir/transforms/params_sync_among_devices_pass.h" #include "paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h" #include "paddle/fluid/pir/transforms/replace_fetch_with_shadow_output_pass.h" +#include "paddle/fluid/pir/transforms/shape_optimization_pass.h" #include "paddle/pir/include/pass/pass_manager.h" COMMON_DECLARE_bool(enable_pir_in_executor); @@ -896,12 +897,19 @@ bool AnalysisPredictor::PrepareExecutor() { pir_program_ = std::move( paddle::TranslateLegacyProgramToProgram(*inference_program_)); +#ifdef PADDLE_WITH_CINN if (paddle::prim::PrimCommonUtils::IsFwdPrimEnabled()) { VLOG(4) << "[Prim] Decomp program in predictor begin."; DecompProgram decomp_object(pir_program_.get()); decomp_object.decomp_program(); + + auto shape_pm = std::make_shared<::pir::PassManager>( + ::pir::IrContext::Instance(), 2); + ::pir::shape::AddShapeOptimizationPass(shape_pm, *pir_program_.get()); + VLOG(4) << "[ShapeDialect] Run AddShapeOptimizationPass"; + shape_pm->Run(pir_program_.get()); } -#ifdef PADDLE_WITH_CINN + if (config_.cinn_enabled()) { VLOG(4) << "[CINN] Begin ApplyCinnPass"; cinn::dialect::ir::ApplyCinnPass(pir_program_.get(), [&] { diff --git a/paddle/fluid/pir/transforms/shape_optimization_pass.cc b/paddle/fluid/pir/transforms/shape_optimization_pass.cc index 80d56f75ae12b..d9cf96f78efe9 100644 --- a/paddle/fluid/pir/transforms/shape_optimization_pass.cc +++ b/paddle/fluid/pir/transforms/shape_optimization_pass.cc @@ -13,12 +13,16 @@ // limitations under the License. 
#include "paddle/fluid/pir/transforms/shape_optimization_pass.h" +#include "paddle/common/flags.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/pir/include/core/dialect.h" #include "paddle/pir/include/dialect/shape/ir/shape_attribute.h" +#include "paddle/pir/include/dialect/shape/ir/shape_dialect.h" #include "paddle/pir/include/pass/pass_manager.h" #include "paddle/pir/include/pass/pass_registry.h" +COMMON_DECLARE_bool(pir_apply_shape_optimization_pass); + const int vlog_level = 3; namespace pir { @@ -155,4 +159,38 @@ std::unique_ptr CreateShapeOptimizationPass() { } // namespace pir +namespace pir::shape { + +bool HasDynamicShape(const pir::Program& program) { + for (const auto& op : *program.block()) { + if (op.isa()) { + continue; + } + for (uint32_t i = 0; i < op.num_results(); ++i) { + if (op.result(i) && op.result(i).type()) { + auto shape_type = + op.result(i).type().dyn_cast(); + if (shape_type && shape_type.IsDynamicShape()) { + VLOG(vlog_level) << "###### HasDynamicShape == true"; + return true; + } + } + } + } + VLOG(vlog_level) << "###### HasDynamicShape == false"; + return false; +} + +void AddShapeOptimizationPass( + std::shared_ptr& pass_manager, // NOLINT + pir::Program& program) { // NOLINT + pir::IrContext* ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + if (HasDynamicShape(program) && FLAGS_pir_apply_shape_optimization_pass) { + pass_manager->AddPass(pir::CreateShapeOptimizationPass()); + } +} + +} // namespace pir::shape + REGISTER_IR_PASS(shape_optimization_pass, pir::ShapeOptimizationPass); diff --git a/paddle/fluid/pir/transforms/shape_optimization_pass.h b/paddle/fluid/pir/transforms/shape_optimization_pass.h index a23de56f35d6e..5050ea727e678 100644 --- a/paddle/fluid/pir/transforms/shape_optimization_pass.h +++ b/paddle/fluid/pir/transforms/shape_optimization_pass.h @@ -17,6 +17,7 @@ #include #include "paddle/pir/include/core/dll_decl.h" #include "paddle/pir/include/dialect/shape/utils/shape_analysis.h" +#include "paddle/pir/include/pass/pass_manager.h" namespace pir { @@ -28,3 +29,12 @@ void InferSymExprForBlock(const Block &block, ShapeConstraintIRAnalysis *shape_analysis); } // namespace pir + +namespace pir::shape { +bool HasDynamicShape(const pir::Program &program); + +void AddShapeOptimizationPass( + std::shared_ptr &pass_manager, // NOLINT + pir::Program &program); // NOLINT + +} // namespace pir::shape diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc index bd603e326a9ad..45fe7263e692c 100644 --- a/paddle/fluid/pybind/pir.cc +++ b/paddle/fluid/pybind/pir.cc @@ -1537,24 +1537,6 @@ void BindUtils(pybind11::module *m) { namespace { -bool HasDynamicShape(const pir::Program &program) { - for (const auto &op : *program.block()) { - if (op.isa()) { - continue; - } - for (uint32_t i = 0; i < op.num_results(); ++i) { - if (op.result(i) && op.result(i).type()) { - auto shape_type = - op.result(i).type().dyn_cast(); - if (shape_type && shape_type.IsDynamicShape()) { - return true; - } - } - } - } - return false; -} - void ApplyCinnPass(Program &program) { // NOLINT #ifdef PADDLE_WITH_CINN cinn::dialect::ir::ApplyCinnPass(&program, [] { @@ -1582,7 +1564,8 @@ void InferSymbolicShapePass( pir::Program &program) { // NOLINT pir::IrContext *ctx = pir::IrContext::Instance(); ctx->GetOrRegisterDialect(); - if (HasDynamicShape(program) && FLAGS_pir_apply_shape_optimization_pass) { + if (pir::shape::HasDynamicShape(program) && + FLAGS_pir_apply_shape_optimization_pass) { 
pass_manager->AddPass(pir::CreateShapeOptimizationPass()); } } diff --git a/test/ir/pir/cinn/symbolic/CMakeLists.txt b/test/ir/pir/cinn/symbolic/CMakeLists.txt index 665d1a0b0461d..9f26f4dd17269 100644 --- a/test/ir/pir/cinn/symbolic/CMakeLists.txt +++ b/test/ir/pir/cinn/symbolic/CMakeLists.txt @@ -11,6 +11,7 @@ if(WITH_GPU) test_if_st.py test_if_dy.py test_llama_if_dy.py + test_decomp_inference_predictor_run.py test_sub_graph_for_backend.py test_sub_graph_for_frontend.py test_check_infer_symbolic.py @@ -70,6 +71,19 @@ if(WITH_GPU) WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) set_tests_properties(test_llama_if_dy PROPERTIES LABELS "RUN_TYPE=CINN") + add_test( + NAME test_decomp_inference_predictor_run + COMMAND + ${CMAKE_COMMAND} -E env + PYTHONPATH=${CMAKE_BINARY_DIR}:${CMAKE_BINARY_DIR}/python/:$ENV{PYTHONPATH} + FLAGS_prim_all=true FLAGS_cinn_bucket_compile=false + FLAGS_pir_apply_shape_optimization_pass=true + FLAGS_prim_enable_dynamic=true ${PYTHON_EXECUTABLE} + ${CMAKE_CURRENT_SOURCE_DIR}/test_decomp_inference_predictor_run.py + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + set_tests_properties(test_decomp_inference_predictor_run + PROPERTIES LABELS "RUN_TYPE=CINN") + add_test( NAME test_cinn_reduce_symbolic_demo COMMAND diff --git a/test/ir/inference/test_decomp_inference_predictor_run.py b/test/ir/pir/cinn/symbolic/test_decomp_inference_predictor_run.py similarity index 96% rename from test/ir/inference/test_decomp_inference_predictor_run.py rename to test/ir/pir/cinn/symbolic/test_decomp_inference_predictor_run.py index 687f28c1bcf15..0a9c091f05ee7 100644 --- a/test/ir/inference/test_decomp_inference_predictor_run.py +++ b/test/ir/pir/cinn/symbolic/test_decomp_inference_predictor_run.py @@ -32,8 +32,7 @@ def forward(self, x1, x2): y1 = self.fc1(x1) y2 = self.fc2(x2) y3 = y1 + y2 - y4 = paddle.nn.functional.layer_norm(y3, y3.shape[1:]) - z = paddle.nn.functional.softmax(y4) + z = paddle.nn.functional.softmax(y3) return z @@ -50,7 +49,9 @@ def setUp(self): net, input_spec=[ paddle.static.InputSpec( - shape=self.shape, dtype='float32', name='input0' + shape=[None, None, None, None], + dtype='float32', + name='input0', ), paddle.static.InputSpec( shape=self.shape, dtype='float32', name='input1' From 754079f9df70864300458e4bfb5e33c50d9cc527 Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Fri, 1 Mar 2024 09:49:35 +0800 Subject: [PATCH 35/55] [PIR] Add missing assign for divide with scalar (#62252) --- python/paddle/pir/math_op_patch.py | 2 +- test/legacy_test/test_math_op_patch_pir.py | 26 ++++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/python/paddle/pir/math_op_patch.py b/python/paddle/pir/math_op_patch.py index a14e8e8c9b90b..925c5b805c9fa 100644 --- a/python/paddle/pir/math_op_patch.py +++ b/python/paddle/pir/math_op_patch.py @@ -338,7 +338,7 @@ def __impl__(self, other_var): python_api == paddle.divide and self.dtype in _supported_int_dtype_ ): - paddle.cast(self, DataType.FLOAT32) + self = paddle.cast(self, DataType.FLOAT32) # here use `scale` replace `elementwise` to get better performance # but only +, -, *, / can use this method if scalar_method is not None: diff --git a/test/legacy_test/test_math_op_patch_pir.py b/test/legacy_test/test_math_op_patch_pir.py index 8862882d89985..12bcebbb3b5f0 100644 --- a/test/legacy_test/test_math_op_patch_pir.py +++ b/test/legacy_test/test_math_op_patch_pir.py @@ -643,6 +643,32 @@ def test_math_exists(self): self.assertTrue(inspect.ismethod(a.asinh_)) self.assertTrue(inspect.ismethod(a.diag)) + def 
test_binary_op_with_scalar(self): + with paddle.pir_utils.IrGuard(): + main_program, exe, program_guard = new_program() + with program_guard: + x_np = np.array(10, dtype=np.int32) + x = paddle.static.data(name='x', shape=[], dtype="int32") + y1 = x / 2 + y2 = x / 5.0 + y3 = x // 2 + y4 = x * 8.0 + self.assertEqual(y1.dtype, paddle.pir.core.DataType.FLOAT32) + self.assertEqual(y2.dtype, paddle.pir.core.DataType.FLOAT32) + self.assertEqual(y3.dtype, paddle.pir.core.DataType.INT32) + self.assertEqual(y4.dtype, paddle.pir.core.DataType.FLOAT32) + (y1_out, y2_out, y3_out, y4_out) = exe.run( + main_program, + feed={ + "x": x_np, + }, + fetch_list=[y1, y2, y3, y4], + ) + np.testing.assert_allclose(x_np / 2, y1_out, rtol=1e-05) + np.testing.assert_allclose(x_np / 5.0, y2_out, rtol=1e-05) + np.testing.assert_allclose(x_np // 2, y3_out, atol=1e-05) + np.testing.assert_allclose(x_np * 8.0, y4_out, rtol=1e-05) + if __name__ == '__main__': unittest.main() From d7f26ef4a51175531c31007c596f5abed1327369 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Fri, 1 Mar 2024 09:53:29 +0800 Subject: [PATCH 36/55] pir onednn sgd (#62244) --- paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml b/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml index 283761ec09903..c76336addc9dc 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml @@ -238,9 +238,7 @@ - op : scale -- op : sgd - -# - op : sgd_dense_param_sparse_grad +- op : sgd_ - op : shape extra_args : str mkldnn_data_type="float32" From ebc27f54db86b70196758c519aea5418674e691c Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Fri, 1 Mar 2024 10:10:08 +0800 Subject: [PATCH 37/55] [PIR] pir onednn support split (#62238) * pir onednn support split --- .../ir_adaptor/translator/op_translator.cc | 18 +++++++++++++++--- .../dialect/operator/ir/ops_onednn_extra.yaml | 5 +++-- test/mkldnn/test_split_bf16_mkldnn_op.py | 2 +- test/mkldnn/test_split_mkldnn_op.py | 14 +++++++++++--- 4 files changed, 30 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc index 1c75d198ef07d..c4ad629fc3d91 100644 --- a/paddle/fluid/ir_adaptor/translator/op_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc @@ -1255,6 +1255,16 @@ struct SplitOpTranscriber : public OpTranscriber { return attribute_map; } +#ifdef PADDLE_WITH_DNNL + else if (op_desc.HasAttr("mkldnn_data_type")) { // NOLINT + pir::AttributeMap attribute_map = { + {"mkldnn_data_type", + pir::StrAttribute::get( + ctx, op_desc.GetAttrIfExists<std::string>("mkldnn_data_type"))}, + }; + return attribute_map; + } +#endif return {}; } @@ -1262,17 +1272,19 @@ pir::OpInfo LookUpOpInfo(pir::IrContext* ctx, const OpDesc& op_desc) override { int num = paddle::get<int>(op_desc.GetAttr("num")); + auto prefix = GetPrefix(ctx, op_desc); std::string target_op_name; if (num > 0) { - target_op_name = "pd_op.split_with_num"; + target_op_name = prefix + "split_with_num"; } else { - target_op_name = "pd_op.split"; + target_op_name = prefix + "split"; } const auto& op_info = ctx->GetRegisteredOpInfo(target_op_name); if (!op_info) { - IR_THROW("Op assign_value should have corresponding OpInfo pd_op.split"); + IR_THROW("Op assign_value should have corresponding OpInfo %s.",
target_op_name); } return op_info; diff --git a/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml b/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml index c76336addc9dc..af136f8a518b5 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml @@ -265,9 +265,10 @@ - op : softplus -# - op : split +- op : split + extra_args : str mkldnn_data_type="float32" -# - op : split_with_num +- op : split_with_num - op : sqrt diff --git a/test/mkldnn/test_split_bf16_mkldnn_op.py b/test/mkldnn/test_split_bf16_mkldnn_op.py index 6e8b1b56ebc07..c9297de55fae5 100644 --- a/test/mkldnn/test_split_bf16_mkldnn_op.py +++ b/test/mkldnn/test_split_bf16_mkldnn_op.py @@ -64,7 +64,7 @@ def setUp(self): } def test_check_output(self): - self.check_output_with_place(core.CPUPlace()) + self.check_output_with_place(core.CPUPlace(), check_pir_onednn=True) class TestSplitNumBF16OneDNNOp(TestSplitSectionsBF16OneDNNOp): diff --git a/test/mkldnn/test_split_mkldnn_op.py b/test/mkldnn/test_split_mkldnn_op.py index 15a24c3b4861f..14e39ab0c01fd 100644 --- a/test/mkldnn/test_split_mkldnn_op.py +++ b/test/mkldnn/test_split_mkldnn_op.py @@ -68,10 +68,15 @@ def setUp(self): } def test_check_output(self): - self.check_output(check_dygraph=False) + self.check_output(check_dygraph=False, check_pir_onednn=True) def test_check_grad(self): - self.check_grad(['X'], ['out0', 'out1', 'out2'], check_dygraph=False) + self.check_grad( + ['X'], + ['out0', 'out1', 'out2'], + check_dygraph=False, + check_pir_onednn=True, + ) # test with attr(num) @@ -87,7 +92,10 @@ def init_test_case(self): def test_check_grad(self): self.check_grad( - ['X'], ['out0', 'out1', 'out2', 'out3'], check_dygraph=False + ['X'], + ['out0', 'out1', 'out2', 'out3'], + check_dygraph=False, + check_pir_onednn=True, ) From 3ce483b52ef4c696dccd9534ccc91998432101de Mon Sep 17 00:00:00 2001 From: winter-wang <78149749+winter-wang@users.noreply.github.com> Date: Fri, 1 Mar 2024 10:10:24 +0800 Subject: [PATCH 38/55] [PIR] add distributed dialect. 
(#61978) --- paddle/fluid/pir/dialect/CMakeLists.txt | 6 + .../distributed/ir/attribute_storage.h | 118 ++++++++++++++++ .../dialect/distributed/ir/dist_attribute.cc | 73 ++++++++++ .../dialect/distributed/ir/dist_attribute.h | 101 ++++++++++++++ .../dialect/distributed/ir/dist_dialect.cc | 62 +++++++++ .../pir/dialect/distributed/ir/dist_dialect.h | 41 ++++++ .../pir/dialect/distributed/ir/dist_type.cc | 43 ++++++ .../pir/dialect/distributed/ir/dist_type.h | 61 +++++++++ .../pir/dialect/distributed/ir/type_storage.h | 81 +++++++++++ paddle/fluid/pybind/pybind.cc | 3 + paddle/pir/include/core/attribute.h | 7 +- paddle/pir/include/core/attribute_base.h | 12 +- paddle/pir/include/core/storage_manager.h | 2 +- .../include/core/storage_manager_support.h | 8 +- paddle/pir/include/core/type.h | 8 +- test/cpp/pir/CMakeLists.txt | 1 + test/cpp/pir/distributed/CMakeLists.txt | 3 + test/cpp/pir/distributed/dist_dialect_test.cc | 127 ++++++++++++++++++ 18 files changed, 743 insertions(+), 14 deletions(-) create mode 100644 paddle/fluid/pir/dialect/distributed/ir/attribute_storage.h create mode 100644 paddle/fluid/pir/dialect/distributed/ir/dist_attribute.cc create mode 100644 paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h create mode 100644 paddle/fluid/pir/dialect/distributed/ir/dist_dialect.cc create mode 100644 paddle/fluid/pir/dialect/distributed/ir/dist_dialect.h create mode 100644 paddle/fluid/pir/dialect/distributed/ir/dist_type.cc create mode 100644 paddle/fluid/pir/dialect/distributed/ir/dist_type.h create mode 100644 paddle/fluid/pir/dialect/distributed/ir/type_storage.h create mode 100644 test/cpp/pir/distributed/CMakeLists.txt create mode 100644 test/cpp/pir/distributed/dist_dialect_test.cc diff --git a/paddle/fluid/pir/dialect/CMakeLists.txt b/paddle/fluid/pir/dialect/CMakeLists.txt index 2955a6d57afb5..d5050b49ac582 100644 --- a/paddle/fluid/pir/dialect/CMakeLists.txt +++ b/paddle/fluid/pir/dialect/CMakeLists.txt @@ -255,6 +255,12 @@ if(WITH_MKLDNN) ${CMAKE_CURRENT_SOURCE_DIR}/operator/ir/manual_onednn_op.cc) endif() +file(GLOB_RECURSE dist_dialect_srcs + "${CMAKE_CURRENT_SOURCE_DIR}/distributed/ir/*.cc") + +if(WITH_DISTRIBUTE) + set(op_dialect_srcs ${op_dialect_srcs} ${dist_dialect_srcs}) +endif() set(op_dialect_deps phi common pir type_info string_helper) cc_library( diff --git a/paddle/fluid/pir/dialect/distributed/ir/attribute_storage.h b/paddle/fluid/pir/dialect/distributed/ir/attribute_storage.h new file mode 100644 index 0000000000000..f572e5dae762b --- /dev/null +++ b/paddle/fluid/pir/dialect/distributed/ir/attribute_storage.h @@ -0,0 +1,118 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
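+// Note on the pattern used by the storage structs below: pir interns
+// attribute storages through its StorageManager: Construct() builds a
+// storage from a ParamKey, HashValue() buckets it, and operator== dedupes
+// it, so equal keys always resolve to the same attribute instance.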
+ +#pragma once + +#include "paddle/common/ddim.h" +#include "paddle/common/hash_funcs.h" +#include "paddle/common/layout.h" +#include "paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h" +#include "paddle/phi/common/reduce_type.h" +#include "paddle/pir/include/core/attribute_base.h" +#include "paddle/pir/include/core/builtin_type.h" +#include "paddle/pir/include/core/utils.h" +#include "paddle/utils/flat_hash_map.h" + +namespace paddle { +namespace dialect { + +struct ProcessMeshAttrStorage : public pir::AttributeStorage { + /// + /// \brief Declare ParamKey according to parameter type. + /// + using ParamKey = phi::distributed::ProcessMesh; + + ProcessMeshAttrStorage(ParamKey&& process_mesh) // NOLINT + : process_mesh(std::move(process_mesh)) {} + + /// + /// \brief Each derived TypeStorage must define a Construct method, which + /// StorageManager uses to construct a derived TypeStorage. + /// + static ProcessMeshAttrStorage* Construct(ParamKey&& key) { + return new ProcessMeshAttrStorage(std::move(key)); + } + + /// + /// \brief Each derived TypeStorage must provide a HashValue method. + /// + static std::size_t HashValue(const ParamKey& key) { return key.hash(); } + + /// + /// \brief Each derived TypeStorage needs to overload operator==. + /// + bool operator==(const ParamKey& key) const { + return process_mesh == key && process_mesh.dim_names() == key.dim_names(); + } + + ParamKey process_mesh; +}; + +struct TensorDistAttrStorage : public pir::AttributeStorage { + /// + /// \brief Declare ParamKey according to parameter type. + /// + using ParamKey = std::tuple<ProcessMeshAttribute, + std::vector<int64_t>, + flat_hash_map<int64_t, phi::ReduceType>>; + + TensorDistAttrStorage(ParamKey&& param) // NOLINT + : process_mesh(std::get<0>(param)), + dims_mapping(std::move(std::get<1>(param))), + partial_status(std::move(std::get<2>(param))) {} + /// + /// \brief Each derived TypeStorage must define a Construct method, which + /// StorageManager uses to construct a derived TypeStorage. + /// + static TensorDistAttrStorage* Construct(ParamKey&& key) { + return new TensorDistAttrStorage(std::move(key)); + } + + /// + /// \brief Each derived TypeStorage must provide a HashValue method. + /// + static std::size_t HashValue(const ParamKey& key) { + auto mesh_hash = std::get<0>(key).hash(); + auto dims_map_hash = std::hash<std::vector<int64_t>>()(std::get<1>(key)); + std::string partial_status_str = "["; + for (auto& itr : std::get<2>(key)) { + partial_status_str += + "Partial(dims:" + std::to_string(itr.first) + ", " + + phi::ReduceTypeStrings[static_cast<int>(itr.second)] + "), "; + } + partial_status_str += "]"; + auto combine_hash = pir::detail::hash_combine(mesh_hash, dims_map_hash); + return pir::detail::hash_combine( + combine_hash, std::hash<std::string>()(partial_status_str)); + } + + /// + /// \brief Each derived TypeStorage needs to overload operator==. + /// + bool operator==(const ParamKey& key) const { + return process_mesh == std::get<0>(key) && + dims_mapping == std::get<1>(key) && + partial_status == std::get<2>(key); + } + + ProcessMeshAttribute process_mesh; + std::vector<int64_t> dims_mapping; + // The partial map holds at most mesh.size() entries; iteration (copy and + // comparison) on it is far more frequent than random element access.
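+ // e.g. {1, phi::ReduceType::kRedSum} records a pending sum-reduction over
+ // mesh dim 1 (the combination exercised by dist_dialect_test.cc below).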
+ flat_hash_map<int64_t, phi::ReduceType> partial_status; +}; + +} // namespace dialect +} // namespace paddle diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_attribute.cc b/paddle/fluid/pir/dialect/distributed/ir/dist_attribute.cc new file mode 100644 index 0000000000000..372d6206c2be8 --- /dev/null +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_attribute.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h" +#include "paddle/fluid/pir/dialect/distributed/ir/attribute_storage.h" namespace paddle { namespace dialect { /// /// \brief ProcessMeshAttribute interface. /// +const phi::distributed::ProcessMesh& ProcessMeshAttribute::process_mesh() + const { + return storage()->process_mesh; +} +ProcessMeshAttribute ProcessMeshAttribute::get( + pir::IrContext* ctx, const phi::distributed::ProcessMesh& mesh) { + return Base::get(ctx, mesh); +} +ProcessMeshAttribute ProcessMeshAttribute::get( + pir::IrContext* ctx, + const std::vector<int64_t>& shape, + const std::vector<int64_t>& process_ids, + const std::vector<std::string>& dim_names) { + return Base::get(ctx, shape, process_ids, dim_names); +} + +/// +/// \brief TensorDistAttribute interface. +/// +ProcessMeshAttribute TensorDistAttribute::mesh_attr() const { + return storage()->process_mesh; +} +const std::vector<int64_t>& TensorDistAttribute::dims_mapping() const { + return storage()->dims_mapping; +} + +std::set<int64_t> TensorDistAttribute::partial_dims() const { + auto& partial = partial_status(); + std::set<int64_t> keys; + for (auto& kv : partial) { + keys.emplace(kv.first); + } + return keys; +} + +const flat_hash_map<int64_t, phi::ReduceType>& +TensorDistAttribute::partial_status() const { + return storage()->partial_status; +} + +TensorDistAttribute TensorDistAttribute::get( + pir::IrContext* ctx, + ProcessMeshAttribute mesh, + const std::vector<int64_t>& dims_mapping, + const flat_hash_map<int64_t, phi::ReduceType>& partial_status) { + return Base::get(ctx, mesh, dims_mapping, partial_status); +} + +} // namespace dialect +} // namespace paddle +IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::ProcessMeshAttribute) +IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::TensorDistAttribute) diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h b/paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h new file mode 100644 index 0000000000000..1ee05404a3df9 --- /dev/null +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h @@ -0,0 +1,101 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/reduce_type.h" +#include "paddle/phi/core/distributed/auto_parallel/process_mesh.h" +#include "paddle/pir/include/core/attribute.h" +#include "paddle/pir/include/core/builtin_attribute_storage.h" +#include "paddle/pir/include/core/utils.h" +#include "paddle/utils/flat_hash_map.h" + +namespace paddle { +namespace dialect { +class ProcessMeshAttrStorage; +class TensorDistAttrStorage; + +class ProcessMeshAttribute : public pir::AttrBase<ProcessMeshAttribute, pir::Attribute, ProcessMeshAttrStorage> { + public: + using Base::Base; + const phi::distributed::ProcessMesh& process_mesh() const; + const std::vector<int64_t>& shape() const { return process_mesh().shape(); } + const std::vector<int64_t>& process_ids() const { + return process_mesh().process_ids(); + } + const std::vector<std::string>& dim_names() const { + return process_mesh().dim_names(); + } + int64_t size() const { return process_mesh().size(); } + int64_t ndim() const { return process_mesh().ndim(); } + int64_t dim_size(int64_t dim) const { return process_mesh().dim_size(dim); } + int64_t dim_size(const std::string& dim_name) const { + return process_mesh().dim_size(dim_name); + } + bool empty() const { return process_mesh().empty(); } + bool contains(int64_t process_id) const { + return process_mesh().contains(process_id); + } + size_t hash() const { return process_mesh().hash(); } + + std::string to_string() const { return process_mesh().to_string(); } + + static ProcessMeshAttribute get(pir::IrContext* ctx, + const phi::distributed::ProcessMesh& mesh); + static ProcessMeshAttribute get(pir::IrContext* ctx, + const std::vector<int64_t>& shape, + const std::vector<int64_t>& process_ids, + const std::vector<std::string>& dim_names); +}; + +class TensorDistAttribute : public pir::AttrBase<TensorDistAttribute, pir::Attribute, TensorDistAttrStorage> { + public: + using Base::Base; + ProcessMeshAttribute mesh_attr() const; + const phi::distributed::ProcessMesh& process_mesh() const { + return mesh_attr().process_mesh(); + } + const std::vector<int64_t>& dims_mapping() const; + + // return the set of mesh dims on which this tensor is partial + std::set<int64_t> partial_dims() const; + + const flat_hash_map<int64_t, phi::ReduceType>& partial_status() const; + + static TensorDistAttribute get( + pir::IrContext* ctx, + ProcessMeshAttribute mesh, + const std::vector<int64_t>& dims_mapping, + const flat_hash_map<int64_t, phi::ReduceType>& partial_status); + static TensorDistAttribute get( + pir::IrContext* ctx, + const phi::distributed::ProcessMesh& mesh, + const std::vector<int64_t>& dims_mapping, + const flat_hash_map<int64_t, phi::ReduceType>& partial_status) { + return get(ctx, + ProcessMeshAttribute::get(ctx, mesh), + dims_mapping, + partial_status); + } +}; + +} // namespace dialect +} // namespace paddle + +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::ProcessMeshAttribute) +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::TensorDistAttribute) diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.cc b/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.cc new file mode 100644 index 0000000000000..5329c0086d742 --- /dev/null +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/pir/dialect/distributed/ir/dist_dialect.h" +#include "paddle/fluid/pir/dialect/distributed/ir/attribute_storage.h" +#include "paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h" +#include "paddle/fluid/pir/dialect/distributed/ir/dist_type.h" +#include "paddle/fluid/pir/dialect/distributed/ir/type_storage.h" + +REGISTER_FILE_SYMBOLS(dist_dialect); +namespace paddle { +namespace dialect { + +DistDialect::DistDialect(pir::IrContext *context) + : pir::Dialect(name(), context, pir::TypeId::get<DistDialect>()) { + initialize(); +} + +void DistDialect::initialize() { + RegisterAttributes<ProcessMeshAttribute, TensorDistAttribute>(); + RegisterTypes<DistDenseTensorType>(); +} + +void DistDialect::PrintType(pir::Type type, std::ostream &os) const { + if (auto dist_dense_tensor_type = type.dyn_cast<DistDenseTensorType>()) { + // Todo: Design the dist dense tensor type print format. + os << dist_dense_tensor_type.dense_tensor_type(); + } else { + os << "error_type!"; + } +} + +void DistDialect::PrintAttribute(pir::Attribute attr, std::ostream &os) const { + if (auto process_mesh_attr = attr.dyn_cast<ProcessMeshAttribute>()) { + os << process_mesh_attr.process_mesh(); + } else if (auto tensor_dist_attr = attr.dyn_cast<TensorDistAttribute>()) { + // Todo: Design the tensor dist attr print format. + os << tensor_dist_attr.process_mesh(); + } else { + os << "error_attribute_type"; + } +} + +pir::OpPrintFn DistDialect::PrintOperation(pir::Operation *op) const { + return nullptr; +} + +} // namespace dialect +} // namespace paddle + +IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::DistDialect) diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.h b/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.h new file mode 100644 index 0000000000000..2a7420b0a495a --- /dev/null +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.h @@ -0,0 +1,41 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
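+
+// A minimal usage sketch (the calls below mirror what the new
+// dist_dialect_test.cc later in this patch exercises; they are not a
+// prescribed API flow):
+//   pir::IrContext* ctx = pir::IrContext::Instance();
+//   ctx->GetOrRegisterDialect<DistDialect>();
+//   auto mesh = ProcessMeshAttribute::get(
+//       ctx, {2, 2}, {0, 1, 2, 3}, {"x", "y"});
+// after which dist attributes and types print through this dialect.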
+ +#pragma once + +#include "paddle/pir/include/core/dialect.h" + +namespace paddle { +namespace dialect { + +class DistDialect : public pir::Dialect { + public: + explicit DistDialect(pir::IrContext* context); + + static const char* name() { return "pd_dist"; } + + void PrintType(pir::Type type, std::ostream& os) const override; + + void PrintAttribute(pir::Attribute attr, std::ostream& os) const override; + + pir::OpPrintFn PrintOperation(pir::Operation* op) const override; // NOLINT + + private: + void initialize(); +}; + +} // namespace dialect +} // namespace paddle + +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::DistDialect) diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_type.cc b/paddle/fluid/pir/dialect/distributed/ir/dist_type.cc new file mode 100644 index 0000000000000..94a2d85fbcdd7 --- /dev/null +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_type.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/pir/dialect/distributed/ir/dist_type.h" +#include "paddle/fluid/pir/dialect/distributed/ir/type_storage.h" + +namespace paddle { +namespace dialect { + +pir::DenseTensorType DistDenseTensorType::dense_tensor_type() const { + return storage()->dense_tensor_type; +} + +TensorDistAttribute DistDenseTensorType::tensor_dist_attr() const { + return storage()->tensor_dist_attr; +} + +const common::DDim& DistDenseTensorType::global_ddim() const { + return storage()->global_ddim; +} + +DistDenseTensorType DistDenseTensorType::get( + pir::IrContext* ctx, + pir::DenseTensorType dense_tensor_type, + TensorDistAttribute tensor_dist_attr, + const common::DDim& global_ddim) { + return Base::get(ctx, dense_tensor_type, tensor_dist_attr, global_ddim); +} +} // namespace dialect +} // namespace paddle + +IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::DistDenseTensorType) diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_type.h b/paddle/fluid/pir/dialect/distributed/ir/dist_type.h new file mode 100644 index 0000000000000..4aa08169440cc --- /dev/null +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_type.h @@ -0,0 +1,61 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
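+
+// A construction sketch mirroring dist_dialect_test.cc (it assumes ctx, a
+// pir::DenseTensorType dense_tensor_type and a TensorDistAttribute
+// tensor_dist_attr have been built already):
+//   auto dist_type = DistDenseTensorType::get(
+//       ctx, dense_tensor_type, tensor_dist_attr, /*global_ddim=*/dims);
+// local_ddim() then falls back to the dims of the wrapped dense tensor type.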
+ +#pragma once + +#include "paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h" +#include "paddle/pir/include/core/builtin_type.h" +#include "paddle/pir/include/core/type.h" + +namespace paddle { +namespace dialect { + +class DistDenseTensorTypeStorage; + +class DistDenseTensorType + : public pir::Type:: + TypeBase { + public: + using Base::Base; + + pir::DenseTensorType dense_tensor_type() const; + TensorDistAttribute tensor_dist_attr() const; + const common::DDim& global_ddim() const; + const common::DDim& local_ddim() const { return dense_tensor_type().dims(); } + Type dtype() const { return dense_tensor_type().dtype(); } + DataLayout data_layout() const { return dense_tensor_type().data_layout(); } + + const phi::distributed::ProcessMesh& process_mesh() const { + return tensor_dist_attr().process_mesh(); + } + const std::vector& dims_mapping() const { + return tensor_dist_attr().dims_mapping(); + } + std::set partial_dims() const { + return tensor_dist_attr().partial_dims(); + } + const flat_hash_map& partial_status() const { + return tensor_dist_attr().partial_status(); + } + + static DistDenseTensorType get(pir::IrContext* ctx, + pir::DenseTensorType dense_tensor_type, + TensorDistAttribute tensor_dist_attr, + const common::DDim& global_ddim); +}; + +} // namespace dialect +} // namespace paddle + +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::DistDenseTensorType) diff --git a/paddle/fluid/pir/dialect/distributed/ir/type_storage.h b/paddle/fluid/pir/dialect/distributed/ir/type_storage.h new file mode 100644 index 0000000000000..1f18573d3e162 --- /dev/null +++ b/paddle/fluid/pir/dialect/distributed/ir/type_storage.h @@ -0,0 +1,81 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h" +#include "paddle/pir/include/core/builtin_type.h" + +namespace paddle { +namespace dialect { +/// +/// \brief Define Parametric TypeStorage for DistDenseTensorType. +/// +struct DistDenseTensorTypeStorage : public pir::TypeStorage { + /// + /// \brief Declare ParamKey according to parameter type. + /// + using ParamKey = + std::tuple; + + DistDenseTensorTypeStorage(pir::DenseTensorType dense_tensor_type, + TensorDistAttribute tensor_dist_attr, + const common::DDim& global_ddim) + : dense_tensor_type(dense_tensor_type), + tensor_dist_attr(tensor_dist_attr), + global_ddim(global_ddim) {} + + /// + /// \brief Each derived TypeStorage must define a Construct method, which + /// StorageManager uses to construct a derived TypeStorage. + /// + static DistDenseTensorTypeStorage* Construct(ParamKey&& key) { + return new DistDenseTensorTypeStorage( + std::get<0>(key), std::get<1>(key), std::get<2>(key)); + } + + /// + /// \brief Each derived TypeStorage must provide a HashValue method. 
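+  /// Here it folds the dense tensor type, dist attribute and global ddim
+  /// hashes together via pir::detail::hash_combine.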
+ /// + static std::size_t HashValue(const ParamKey& key) { + auto dense_tensor_type_hash = std::hash()(std::get<0>(key)); + auto tensor_dist_attr_hash = std::hash()(std::get<1>(key)); + auto global_ddim_hash = std::hash()(std::get<2>(key)); + auto value = pir::detail::hash_combine(dense_tensor_type_hash, + tensor_dist_attr_hash); + return pir::detail::hash_combine(value, global_ddim_hash); + } + + /// + /// \brief Each derived TypeStorage needs to overload operator==. + /// + bool operator==(const ParamKey& key) const { + return dense_tensor_type == std::get<0>(key) && + tensor_dist_attr == std::get<1>(key) && + global_ddim == std::get<2>(key); + } + + /// + /// \brief DistDenseTensorTypeStorage include three parameters: + /// dense_tensor_type, tensor_dist_attr and global_ddim; + /// + pir::DenseTensorType dense_tensor_type; + TensorDistAttribute tensor_dist_attr; + common::DDim global_ddim; +}; + +} // namespace dialect +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index f1d53f3f88750..ffaef54bb9da9 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -223,6 +223,9 @@ PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType); DECLARE_FILE_SYMBOLS(init_phi); DECLARE_FILE_SYMBOLS(kernel_dialect); +#ifdef PADDLE_WITH_DISTRIBUTE +DECLARE_FILE_SYMBOLS(dist_dialect); +#endif DECLARE_FILE_SYMBOLS(buffered_allocator); DECLARE_FILE_SYMBOLS(best_fit_allocator); DECLARE_FILE_SYMBOLS(aligned_allocator); diff --git a/paddle/pir/include/core/attribute.h b/paddle/pir/include/core/attribute.h index 9571440679b8c..2c1ca17656811 100644 --- a/paddle/pir/include/core/attribute.h +++ b/paddle/pir/include/core/attribute.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/pir/include/core/cast_utils.h" +#include "paddle/pir/include/core/storage_manager_support.h" #include "paddle/pir/include/core/type_id.h" constexpr char kAttrStopGradients[] = "stop_gradient"; @@ -87,6 +88,8 @@ class IR_API Attribute { return pir::dyn_cast(*this); } + std::size_t hash() const { return std::hash()(storage_); } + protected: const Storage *storage_{nullptr}; }; @@ -97,8 +100,6 @@ IR_API std::ostream &operator<<(std::ostream &os, Attribute attr); namespace std { template <> struct hash { - std::size_t operator()(const pir::Attribute &obj) const { - return std::hash()(obj); - } + std::size_t operator()(const pir::Attribute &obj) const { return obj.hash(); } }; } // namespace std diff --git a/paddle/pir/include/core/attribute_base.h b/paddle/pir/include/core/attribute_base.h index d6c75f2e5d8ce..0f459f23e9f99 100644 --- a/paddle/pir/include/core/attribute_base.h +++ b/paddle/pir/include/core/attribute_base.h @@ -16,8 +16,8 @@ #include "paddle/pir/include/core/ir_context.h" #include "paddle/pir/include/core/storage_manager.h" +#include "paddle/pir/include/core/storage_manager_support.h" #include "paddle/pir/include/core/type_id.h" - namespace pir { class Dialect; @@ -239,6 +239,16 @@ struct IR_API AttributeManager { } }; +template +using AttrBase = detail::StorageHelperBase; + /// /// \brief Add some necessary functions to the custom Attribute class. 
/// diff --git a/paddle/pir/include/core/storage_manager.h b/paddle/pir/include/core/storage_manager.h index 8cacc3bd38bd0..7024e580e4a1f 100644 --- a/paddle/pir/include/core/storage_manager.h +++ b/paddle/pir/include/core/storage_manager.h @@ -74,7 +74,7 @@ class IR_API StorageManager { return static_cast(*existing) == param; }; auto constructor = [&]() { - auto *storage = Storage::Construct(param); + auto *storage = Storage::Construct(std::move(param)); if (init_func) init_func(storage); return storage; }; diff --git a/paddle/pir/include/core/storage_manager_support.h b/paddle/pir/include/core/storage_manager_support.h index 7d4d540382dcd..b729a4480ac35 100644 --- a/paddle/pir/include/core/storage_manager_support.h +++ b/paddle/pir/include/core/storage_manager_support.h @@ -18,8 +18,6 @@ #include "paddle/pir/include/core/interface_support.h" #include "paddle/pir/include/core/ir_context.h" -#include "paddle/pir/include/core/type.h" -#include "paddle/pir/include/core/type_base.h" #include "paddle/pir/include/core/type_id.h" namespace pir { @@ -68,7 +66,7 @@ class StorageHelperBase : public BaseT { typename Filter>::Type; static ConcreteT dyn_cast_impl(BaseT type) { - if (type && type.abstract_type().type_id() == TypeId::get()) { + if (type && type.type_id() == TypeId::get()) { return ConcreteT(type.storage()); } return ConcreteT(nullptr); @@ -107,8 +105,8 @@ class StorageHelperBase : public BaseT { /// \brief Get or create a new ConcreteT instance within the ctx. /// template - static ConcreteT get(pir::IrContext *ctx, Args... args) { - return ManagerT::template get(ctx, args...); + static ConcreteT get(pir::IrContext *ctx, Args &&...args) { + return ManagerT::template get(ctx, std::forward(args)...); } /// diff --git a/paddle/pir/include/core/type.h b/paddle/pir/include/core/type.h index 569b356135b18..fcfe0a77a8ac5 100644 --- a/paddle/pir/include/core/type.h +++ b/paddle/pir/include/core/type.h @@ -18,6 +18,7 @@ #include "paddle/pir/include/core/cast_utils.h" #include "paddle/pir/include/core/storage_manager_support.h" +#include "paddle/pir/include/core/type_base.h" #include "paddle/pir/include/core/type_id.h" namespace pir { @@ -42,7 +43,6 @@ class IR_API Type { StorageType, TypeManager, TraitOrInterface...>; - using Storage = TypeStorage; using AbstractT = AbstractType; @@ -125,6 +125,8 @@ class IR_API Type { bool IsIntOrIndex() const; bool IsIndex() const; + std::size_t hash() const { return std::hash()(storage_); } + protected: const Storage *storage_{nullptr}; @@ -184,8 +186,6 @@ namespace std { /// template <> struct hash { - std::size_t operator()(const pir::Type &obj) const { - return std::hash()(obj); - } + std::size_t operator()(const pir::Type &obj) const { return obj.hash(); } }; } // namespace std diff --git a/test/cpp/pir/CMakeLists.txt b/test/cpp/pir/CMakeLists.txt index 420ffa8b6dc5a..e7de653656897 100644 --- a/test/cpp/pir/CMakeLists.txt +++ b/test/cpp/pir/CMakeLists.txt @@ -7,3 +7,4 @@ add_subdirectory(cinn) add_subdirectory(control_flow_dialect) add_subdirectory(shape_dialect) add_subdirectory(sub_graph) +add_subdirectory(distributed) diff --git a/test/cpp/pir/distributed/CMakeLists.txt b/test/cpp/pir/distributed/CMakeLists.txt new file mode 100644 index 0000000000000..0483dbe1fdac0 --- /dev/null +++ b/test/cpp/pir/distributed/CMakeLists.txt @@ -0,0 +1,3 @@ +if(WITH_DISTRIBUTE) + paddle_test(dist_dialect_test SRCS dist_dialect_test.cc) +endif() diff --git a/test/cpp/pir/distributed/dist_dialect_test.cc b/test/cpp/pir/distributed/dist_dialect_test.cc new file mode 
100644 index 0000000000000..01dcb2f1010d5 --- /dev/null +++ b/test/cpp/pir/distributed/dist_dialect_test.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include + +#include "paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h" +#include "paddle/fluid/pir/dialect/distributed/ir/dist_dialect.h" +#include "paddle/fluid/pir/dialect/distributed/ir/dist_type.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/pir/include/core/builtin_type.h" + +using namespace paddle::dialect; // NOLINT + +TEST(process_mesh_test, base) { + pir::IrContext* ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + std::vector mesh_shape = {2, 2}; + std::vector process_ids = {0, 1, 2, 3}; + std::vector dim_names = {"x", "y"}; + std::vector dim_names_2 = {"x", "s"}; + phi::distributed::ProcessMesh process_mesh( + mesh_shape, process_ids, dim_names); + + // construct a ProcessMeshAttribute. + auto mesh_attr = + ProcessMeshAttribute::get(ctx, mesh_shape, process_ids, dim_names); + auto mesh_attr_1 = ProcessMeshAttribute::get(ctx, process_mesh); + auto mesh_attr_2 = + ProcessMeshAttribute::get(ctx, mesh_shape, process_ids, dim_names_2); + EXPECT_EQ(mesh_attr, mesh_attr_1); + EXPECT_NE(mesh_attr, mesh_attr_2); + + // test member function. + EXPECT_EQ(mesh_attr.process_mesh(), process_mesh); + EXPECT_EQ(mesh_attr.shape(), mesh_shape); + EXPECT_EQ(mesh_attr.process_ids(), process_ids); + EXPECT_EQ(mesh_attr.dim_names(), dim_names); + EXPECT_EQ(mesh_attr.size(), 4); + EXPECT_EQ(mesh_attr.ndim(), 2); + EXPECT_EQ(mesh_attr.dim_size(0), 2); + EXPECT_EQ(mesh_attr.dim_size("y"), 2); + EXPECT_FALSE(mesh_attr.empty()); + EXPECT_TRUE(mesh_attr.contains(3)); + EXPECT_EQ(mesh_attr.hash(), process_mesh.hash()); + EXPECT_EQ(mesh_attr.to_string(), process_mesh.to_string()); +} +TEST(tensor_dist_attr_test, base) { + pir::IrContext* ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + + std::vector mesh_shape = {2, 3}; + std::vector process_ids = {0, 1, 2, 3, 4, 5}; + std::vector dim_names = {"x", "y"}; + phi::distributed::ProcessMesh process_mesh( + mesh_shape, process_ids, dim_names); + std::vector dims_mapping = {0, -1}; + paddle::flat_hash_map partial_status, + partial_status_1{{1, phi::ReduceType::kRedSum}}; + + auto mesh_attr = + ProcessMeshAttribute::get(ctx, mesh_shape, process_ids, dim_names); + + // construct a TensorDistAttribute. + auto tensor_dist_attr = + TensorDistAttribute::get(ctx, mesh_attr, dims_mapping, partial_status); + auto tensor_dist_attr_1 = + TensorDistAttribute::get(ctx, process_mesh, dims_mapping, partial_status); + auto tensor_dist_attr_2 = TensorDistAttribute::get( + ctx, process_mesh, dims_mapping, partial_status_1); + EXPECT_EQ(tensor_dist_attr, tensor_dist_attr_1); + EXPECT_NE(tensor_dist_attr, tensor_dist_attr_2); + + // test member function. 
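+  // every accessor should round-trip exactly the values the attribute was
+  // built from (mesh, dims_mapping and partial_status)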
+ EXPECT_EQ(tensor_dist_attr.mesh_attr(), mesh_attr); + EXPECT_EQ(tensor_dist_attr.process_mesh(), process_mesh); + EXPECT_EQ(tensor_dist_attr.dims_mapping(), dims_mapping); + EXPECT_EQ(tensor_dist_attr.partial_status(), partial_status); +} + +TEST(dist_dense_tensor_type_test, base) { + pir::IrContext* ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect(); + std::vector mesh_shape = {2, 3}; + std::vector process_ids = {0, 1, 2, 3, 4, 5}; + std::vector dim_names = {"x", "y"}; + phi::distributed::ProcessMesh process_mesh( + mesh_shape, process_ids, dim_names); + auto mesh_attr = ProcessMeshAttribute::get(ctx, process_mesh); + + std::vector dims_mapping = {0, -1}; + paddle::flat_hash_map partial_status{ + {1, phi::ReduceType::kRedSum}}; + // construct a TensorDistAttribute. + auto tensor_dist_attr = + TensorDistAttribute::get(ctx, mesh_attr, dims_mapping, partial_status); + + pir::Type fp32_dtype = pir::Float32Type::get(ctx); + common::DDim dims = {2, 2}; + common::DataLayout data_layout = common::DataLayout::NCHW; + pir::LoD lod = {{0, 1, 2}}; + size_t offset = 0; + pir::DenseTensorType dense_tensor_type = pir::DenseTensorType::get( + ctx, fp32_dtype, dims, data_layout, lod, offset); + + auto dist_densor_type = + DistDenseTensorType::get(ctx, dense_tensor_type, tensor_dist_attr, dims); + + EXPECT_EQ(dist_densor_type.process_mesh(), process_mesh); + EXPECT_EQ(dist_densor_type.dims_mapping(), dims_mapping); + EXPECT_EQ(dist_densor_type.partial_status(), partial_status); + EXPECT_EQ(dist_densor_type.dtype().isa(), true); + EXPECT_EQ(dist_densor_type.global_ddim(), dims); + EXPECT_EQ(dist_densor_type.data_layout(), data_layout); + EXPECT_EQ(dist_densor_type.local_ddim(), dims); +} From 12d1ecbe8ba378fb4d5120fa0e7938e1e5c70edf Mon Sep 17 00:00:00 2001 From: gouzil <66515297+gouzil@users.noreply.github.com> Date: Fri, 1 Mar 2024 10:24:19 +0800 Subject: [PATCH 39/55] [SOT][3.12] add `LOAD_FAST_CHECK` OpCode (#62218) --- .../jit/sot/opcode_translator/executor/opcode_executor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py index 7d58a78a9322d..3dfa9fb1b733b 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py +++ b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py @@ -809,6 +809,9 @@ def LOAD_FAST(self, instr: Instruction): var = self._locals[instr.argval] self.stack.push(var) + def LOAD_FAST_CHECK(self, instr: Instruction): + self.LOAD_FAST(instr) + def DELETE_FAST(self, instr: Instruction): varname = self._code.co_varnames[instr.arg] del self._locals[varname] From 7a0807f231b4e33bad8cab6af8cda85e5763f88e Mon Sep 17 00:00:00 2001 From: Bo Zhang <105368690+zhangbopd@users.noreply.github.com> Date: Fri, 1 Mar 2024 10:53:17 +0800 Subject: [PATCH 40/55] [PIR][DynamicShape] Fix Gather Op and Shape Op && Add BC_binary Ops' inferSymbolic shape (#62248) * add gather * add binary * fix pd.shape && cinn.concat --- .../infer_symbolic_shape/cinn_op_infer_sym.cc | 19 ++ .../infer_sym_element_wise_binary.cc | 97 ++++++-- .../infer_sym_element_wise_binary.h | 55 +++-- .../paddle_op_infer_sym.cc | 214 +++++++----------- .../paddle_op_infer_sym.h | 36 --- .../same_operands_and_result.cc | 4 + .../same_operands_and_result.h | 2 + paddle/fluid/pir/dialect/operator/ir/ops.yaml | 1 + 8 files changed, 218 insertions(+), 210 deletions(-) diff --git 
a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc index 0e8240434e070..f81624427207e 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc @@ -41,6 +41,25 @@ bool ConcatOpInferSymbolicShape( const auto input_values = op->operands_source(); const auto input_size = input_values.size(); + if (shape_analysis->GetShapeOrDataForValue(input_values[0]) + .data() + .has_value()) { + std::vector out_data; + for (const auto &value : input_values) { + const auto &shape_or_data = shape_analysis->GetShapeOrDataForValue(value); + for (size_t i = 0; i < shape_or_data.data().value().size(); ++i) { + out_data.emplace_back(shape_or_data.data().value()[i]); + } + } + const std::vector shape{std::int64_t(out_data.size())}; + symbol::ShapeOrDataDimExprs shape_data{ + symbol::TensorShapeOrDataDimExprs(shape, out_data)}; + + pir::Value res = op->result(0); + shape_analysis->SetShapeOrDataForValue(res, shape_data); + return true; + } + int axis = op->attributes().at("axis").dyn_cast().data(); const auto &GetOutDimExprs = [&]() -> std::vector { diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.cc index 21da5351c617d..da8b68aefe206 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.cc @@ -79,27 +79,34 @@ bool InferSymbolicShapeElementWiseBinary( } namespace paddle::dialect { - bool AddOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return InferSymbolicShapeElementWiseBinary(op, shape_analysis); } - bool Add_OpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return InferSymbolicShapeElementWiseBinary(op, shape_analysis); } - bool BitwiseAndOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return InferSymbolicShapeElementWiseBinary(op, shape_analysis); } - bool BitwiseAnd_OpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - return BitwiseAndOpInferSymbolicShape(op, shape_analysis); + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); +} +bool BitwiseXorOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); +} +bool BitwiseXor_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); +} +bool ComplexOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); } - bool DivideOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return InferSymbolicShapeElementWiseBinary(op, shape_analysis); @@ -108,42 +115,82 @@ bool Divide_OpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return InferSymbolicShapeElementWiseBinary(op, shape_analysis); } - bool ElementwisePowOpInferSymbolicShape( pir::Operation 
*op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return InferSymbolicShapeElementWiseBinary(op, shape_analysis); } - +bool FmaxOpInferSymbolicShape(pir::Operation *op, + pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); +} +bool FminOpInferSymbolicShape(pir::Operation *op, + pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); +} +bool GreaterEqualOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); +} +bool GreaterEqual_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); +} bool GreaterThanOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return InferSymbolicShapeElementWiseBinary(op, shape_analysis); } - bool GreaterThan_OpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - return GreaterThanOpInferSymbolicShape(op, shape_analysis); + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); +} +bool LessEqualOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); +} +bool LessEqual_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); } - bool LessThanOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return InferSymbolicShapeElementWiseBinary(op, shape_analysis); } - bool LessThan_OpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - return LessThanOpInferSymbolicShape(op, shape_analysis); + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); } - bool LogicalAndOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return InferSymbolicShapeElementWiseBinary(op, shape_analysis); } - bool LogicalAnd_OpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - return LogicalAndOpInferSymbolicShape(op, shape_analysis); + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); +} +bool LogicalOrOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); +} +bool LogicalOr_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); +} +bool LogicalXorOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); +} +bool LogicalXor_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); +} +bool MaximumOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); +} +bool MinimumOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); } - bool MultiplyOpInferSymbolicShape( pir::Operation *op, 
pir::ShapeConstraintIRAnalysis *shape_analysis) { return InferSymbolicShapeElementWiseBinary(op, shape_analysis); @@ -152,23 +199,29 @@ bool MultiplySrOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return InferSymbolicShapeElementWiseBinary(op, shape_analysis); } -bool Multiply_OpInferSymbolicShape( +bool MultiplySr_OpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return InferSymbolicShapeElementWiseBinary(op, shape_analysis); } -bool MultiplySr_OpInferSymbolicShape( +bool Multiply_OpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return InferSymbolicShapeElementWiseBinary(op, shape_analysis); } - bool NotEqualOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return InferSymbolicShapeElementWiseBinary(op, shape_analysis); } - bool NotEqual_OpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - return NotEqualOpInferSymbolicShape(op, shape_analysis); + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); +} +bool RemainderOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); +} +bool Remainder_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); } } // namespace paddle::dialect diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.h index e15d769fc8b02..be23d3cb20d9f 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.h @@ -19,58 +19,75 @@ namespace paddle::dialect { bool AddOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); - bool Add_OpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); - bool BitwiseAndOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); - bool BitwiseAnd_OpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); - +bool BitwiseXorOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); +bool BitwiseXor_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); +bool ComplexOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); bool DivideOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); bool Divide_OpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); - bool ElementwisePowOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); - +bool FmaxOpInferSymbolicShape(pir::Operation *op, + pir::ShapeConstraintIRAnalysis *shape_analysis); +bool FminOpInferSymbolicShape(pir::Operation *op, + pir::ShapeConstraintIRAnalysis *shape_analysis); +bool GreaterEqualOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); +bool GreaterEqual_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); bool 
GreaterThanOpInferSymbolicShape(
    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis);
-
 bool GreaterThan_OpInferSymbolicShape(
     pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis);
-
+bool LessEqualOpInferSymbolicShape(
+    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis);
+bool LessEqual_OpInferSymbolicShape(
+    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis);
 bool LessThanOpInferSymbolicShape(
     pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis);
-
 bool LessThan_OpInferSymbolicShape(
     pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis);
-
 bool LogicalAndOpInferSymbolicShape(
     pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis);
-
 bool LogicalAnd_OpInferSymbolicShape(
     pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis);
-
+bool LogicalOrOpInferSymbolicShape(
+    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis);
+bool LogicalOr_OpInferSymbolicShape(
+    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis);
+bool LogicalXorOpInferSymbolicShape(
+    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis);
+bool LogicalXor_OpInferSymbolicShape(
+    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis);
+bool MaximumOpInferSymbolicShape(
+    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis);
+bool MinimumOpInferSymbolicShape(
+    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis);
 bool MultiplyOpInferSymbolicShape(
     pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis);
-
 bool MultiplySrOpInferSymbolicShape(
     pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis);
-
-bool Multiply_OpInferSymbolicShape(
-    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis);
-
 bool MultiplySr_OpInferSymbolicShape(
     pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis);
-
+bool Multiply_OpInferSymbolicShape(
+    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis);
 bool NotEqualOpInferSymbolicShape(
     pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis);
-
 bool NotEqual_OpInferSymbolicShape(
     pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis);
+bool RemainderOpInferSymbolicShape(
+    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis);
+bool Remainder_OpInferSymbolicShape(
+    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis);
 }  // namespace paddle::dialect
diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc
index 6f4a4dacd7ba2..d95f109563518 100644
--- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc
+++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc
@@ -59,20 +59,12 @@ bool ShapeOpInferSymbolicShape(pir::Operation *op,
                                pir::ShapeConstraintIRAnalysis *shape_analysis) {
   const symbol::ShapeOrDataDimExprs &operand_shape_or_data =
       shape_analysis->GetShapeOrDataForValue(op->operand_source(0));
-
-  const std::vector<symbol::DimExpr> sym_shape = [&] {
-    std::vector<symbol::DimExpr> sym_shape;
-    symbol::DimExpr dim_expr(
-        op->result(0).type().dyn_cast<paddle::dialect::DenseTensorType>().dims()[0]);
-    sym_shape.emplace_back(dim_expr);
-    return sym_shape;
-  }();
-
-  symbol::ShapeOrDataDimExprs shape_or_data{symbol::TensorShapeOrDataDimExprs(
-      sym_shape, operand_shape_or_data.shape())};
+  const auto &out_data = operand_shape_or_data.shape();
+  const std::vector<symbol::DimExpr> shape{std::int64_t(out_data.size())};
+  symbol::ShapeOrDataDimExprs shape_or_data{
+      symbol::TensorShapeOrDataDimExprs(shape, out_data)};

   shape_analysis->SetShapeOrDataForValue(op->result(0), shape_or_data);
-
   return true;
 }
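Taken on its own, the new ShapeOp rule above says: for y = shape(x), y is a rank-1 tensor of extent rank(x) whose symbolic *data* is x's symbolic shape. A minimal standalone sketch of that rule, using a toy SymDim/SymShapeOrData representation rather than the Paddle API:

#include <cstdint>
#include <string>
#include <vector>

struct SymDim {
  std::int64_t value;  // concrete extent, or -1 if symbolic
  std::string symbol;  // symbol name when the extent is unknown
};

struct SymShapeOrData {
  std::vector<SymDim> shape;  // symbolic shape of the tensor
  std::vector<SymDim> data;   // symbolic value, when the tensor holds dims
};

// y = shape(x): y is 1-D with extent rank(x); its data is x's shape.
SymShapeOrData InferShapeOp(const SymShapeOrData& x) {
  SymShapeOrData y;
  y.shape = {SymDim{static_cast<std::int64_t>(x.shape.size()), ""}};
  y.data = x.shape;
  return y;
}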
@@ -511,25 +503,21 @@ bool ConcatOpInferSymbolicShape(

 bool GatherNdOpInferSymbolicShape(
     pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) {
-  auto x_shape_or_data =
+  const auto &x_shape_or_data =
       shape_analysis->GetShapeOrDataForValue(op->operand_source(0));
-  auto index_shape_or_data =
+  const auto &index_shape_or_data =
       shape_analysis->GetShapeOrDataForValue(op->operand_source(1));

-  std::vector<symbol::DimExpr> x_sym_shape;
-  if (x_shape_or_data.data().has_value()) {
-    x_sym_shape = x_shape_or_data.data().value();
-  } else {
-    x_sym_shape = x_shape_or_data.shape();
-  }
-  int x_dims_size = x_sym_shape.size();
+  const std::vector<symbol::DimExpr> &x_sym_shape =
+      x_shape_or_data.data().has_value() ? x_shape_or_data.data().value()
+                                         : x_shape_or_data.shape();

-  std::vector<symbol::DimExpr> index_sym_shape;
-  if (index_shape_or_data.data().has_value()) {
-    index_sym_shape = index_shape_or_data.data().value();
-  } else {
-    index_sym_shape = index_shape_or_data.shape();
-  }
+  const std::vector<symbol::DimExpr> &index_sym_shape =
+      index_shape_or_data.data().has_value()
+          ? index_shape_or_data.data().value()
+          : index_shape_or_data.shape();
+
+  int x_dims_size = x_sym_shape.size();
   int index_dims_size = index_sym_shape.size();

   std::vector<symbol::DimExpr> result_sym_dims;
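The ternaries introduced above all follow one convention worth spelling out: when an operand is itself a shape-carrying tensor (e.g. the output of a shape op), its symbolic value lives in data() and should be preferred over its shape(). A minimal sketch of that selection, with an illustrative ShapeOrData type rather than the Paddle one:

#include <optional>
#include <vector>

template <typename Dim>
struct ShapeOrData {
  std::vector<Dim> shape;
  std::optional<std::vector<Dim>> data;  // set when the tensor carries dims
};

// Prefer the symbolic value when present; otherwise use the symbolic shape.
template <typename Dim>
const std::vector<Dim>& DimsOf(const ShapeOrData<Dim>& t) {
  return t.data.has_value() ? t.data.value() : t.shape;
}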
@@ -1159,26 +1147,6 @@ bool AsStridedOpInferSymbolicShape(
   return true;
 }

-bool BitwiseXorOpInferSymbolicShape(
-    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) {
-  PADDLE_THROW(phi::errors::Unimplemented(
-      op->name() + " 's InferSymbolicShape interface is NOT implemented now."));
-  return true;
-}
-bool BitwiseXor_OpInferSymbolicShape(
-    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) {
-  PADDLE_THROW(phi::errors::Unimplemented(
-      op->name() + " 's InferSymbolicShape interface is NOT implemented now."));
-  return true;
-}
-
-bool ComplexOpInferSymbolicShape(
-    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) {
-  PADDLE_THROW(phi::errors::Unimplemented(
-      op->name() + " 's InferSymbolicShape interface is NOT implemented now."));
-  return true;
-}
-
 bool CummaxOpInferSymbolicShape(
     pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) {
   PADDLE_THROW(phi::errors::Unimplemented(
@@ -1234,22 +1202,70 @@ bool DirichletOpInferSymbolicShape(
   return true;
 }

-bool FmaxOpInferSymbolicShape(pir::Operation *op,
-                              pir::ShapeConstraintIRAnalysis *shape_analysis) {
-  PADDLE_THROW(phi::errors::Unimplemented(
-      op->name() + " 's InferSymbolicShape interface is NOT implemented now."));
-  return true;
-}
-bool FminOpInferSymbolicShape(pir::Operation *op,
-                              pir::ShapeConstraintIRAnalysis *shape_analysis) {
-  PADDLE_THROW(phi::errors::Unimplemented(
-      op->name() + " 's InferSymbolicShape interface is NOT implemented now."));
-  return true;
-}
 bool GatherOpInferSymbolicShape(
     pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) {
-  PADDLE_THROW(phi::errors::Unimplemented(
-      op->name() + " 's InferSymbolicShape interface is NOT implemented now."));
+  const auto &input_shape_or_data =
+      shape_analysis->GetShapeOrDataForValue(op->operand_source(0));
+  const auto &index_shape_or_data =
+      shape_analysis->GetShapeOrDataForValue(op->operand_source(1));
+
+  const auto &numel = [&] {
+    symbol::DimExpr numel{1};
+    for (const auto &dim_expr : index_shape_or_data.shape()) {
+      numel = numel * dim_expr;
+    }
+    return numel;
+  }();
+
+  const auto &axis_shape_or_data =
+      shape_analysis->GetShapeOrDataForValue(op->operand_source(2));
+
+  const std::vector<symbol::DimExpr> &input_sym_shape =
+      input_shape_or_data.data().has_value()
+          ? input_shape_or_data.data().value()
+          : input_shape_or_data.shape();
+
+  const std::vector<symbol::DimExpr> &index_sym_shape =
+      index_shape_or_data.data().has_value()
+          ? index_shape_or_data.data().value()
+          : index_shape_or_data.shape();
+
+  int axis =
+      static_cast<int>(axis_shape_or_data.data().value()[0].Get<int64_t>());
+  if (axis < 0) axis += input_sym_shape.size();
+
+  const auto &out_sym_shape = [&] {
+    std::vector<symbol::DimExpr> out_sym_shape;
+
+    if (index_sym_shape.size() == 0) {
+      if (input_sym_shape.size() == 1) {
+        out_sym_shape.push_back(symbol::DimExpr{0});
+      } else {
+        for (int i = 0; i < axis; ++i) {
+          out_sym_shape.push_back(input_sym_shape[i]);
+        }
+        for (size_t i = axis + 1; i < input_sym_shape.size(); ++i) {
+          out_sym_shape.push_back(input_sym_shape[i]);
+        }
+      }
+    } else {
+      for (int i = 0; i < axis; ++i) {
+        out_sym_shape.push_back(input_sym_shape[i]);
+      }
+      out_sym_shape.push_back(numel);
+      for (size_t i = axis + 1; i < input_sym_shape.size(); ++i) {
+        out_sym_shape.push_back(input_sym_shape[i]);
+      }
+    }
+    return out_sym_shape;
+  }();
+
+  symbol::ShapeOrDataDimExprs shape_data{
+      symbol::TensorShapeOrDataDimExprs(out_sym_shape)};
+
+  pir::Value res = op->result(0);
+  shape_analysis->SetShapeOrDataForValue(res, shape_data);
+
   return true;
 }

@@ -1272,30 +1288,6 @@ bool LogcumsumexpOpInferSymbolicShape(
       op->name() + " 's InferSymbolicShape interface is NOT implemented now."));
   return true;
 }
-bool LogicalOrOpInferSymbolicShape(
-    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) {
-  PADDLE_THROW(phi::errors::Unimplemented(
-      op->name() + " 's InferSymbolicShape interface is NOT implemented now."));
-  return true;
-}
-bool LogicalOr_OpInferSymbolicShape(
-    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) {
-  PADDLE_THROW(phi::errors::Unimplemented(
-      op->name() + " 's InferSymbolicShape interface is NOT implemented now."));
-  return true;
-}
-bool LogicalXorOpInferSymbolicShape(
-    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) {
-  PADDLE_THROW(phi::errors::Unimplemented(
-      op->name() + " 's InferSymbolicShape interface is NOT implemented now."));
-  return true;
-}
-bool LogicalXor_OpInferSymbolicShape(
-    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) {
-  PADDLE_THROW(phi::errors::Unimplemented(
-      op->name() + " 's InferSymbolicShape interface is NOT implemented now."));
-  return true;
-}

 bool MaskedSelectOpInferSymbolicShape(
     pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) {
@@ -1379,30 +1371,7 @@ bool GaussianOpInferSymbolicShape(
       op->name() + " 's InferSymbolicShape interface is NOT implemented now."));
   return true;
 }
-bool GreaterEqualOpInferSymbolicShape(
-    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) {
-  PADDLE_THROW(phi::errors::Unimplemented(
-      op->name() + " 's InferSymbolicShape interface is NOT implemented now."));
-  return true;
-}
-bool GreaterEqual_OpInferSymbolicShape(
-    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) {
-  PADDLE_THROW(phi::errors::Unimplemented(
-      op->name() + " 's InferSymbolicShape interface is NOT implemented now."));
-  return true;
-}
-bool LessEqualOpInferSymbolicShape(
-    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) {
-  
PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} -bool LessEqual_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} + bool LinspaceOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { PADDLE_THROW(phi::errors::Unimplemented( @@ -1421,24 +1390,14 @@ bool LogsumexpOpInferSymbolicShape( op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); return true; } -bool MaximumOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} + bool MinOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { PADDLE_THROW(phi::errors::Unimplemented( op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); return true; } -bool MinimumOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} + bool PadOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { PADDLE_THROW(phi::errors::Unimplemented( @@ -1451,18 +1410,7 @@ bool RandintOpInferSymbolicShape( op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); return true; } -bool RemainderOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} -bool Remainder_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} + bool RepeatInterleaveOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { PADDLE_THROW(phi::errors::Unimplemented( diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h index a13d93486b140..cf5e650023fa9 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h @@ -126,13 +126,6 @@ bool AsRealOpInferSymbolicShape(pir::Operation *op, bool AsStridedOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); -bool BitwiseXorOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); -bool BitwiseXor_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); -bool ComplexOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); - bool CummaxOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); bool CumminOpInferSymbolicShape(pir::Operation *op, @@ -153,10 +146,6 @@ bool DiagonalOpInferSymbolicShape( bool DirichletOpInferSymbolicShape( pir::Operation *op, 
pir::ShapeConstraintIRAnalysis *shape_analysis); -bool FmaxOpInferSymbolicShape(pir::Operation *op, - pir::ShapeConstraintIRAnalysis *shape_analysis); -bool FminOpInferSymbolicShape(pir::Operation *op, - pir::ShapeConstraintIRAnalysis *shape_analysis); bool GatherOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); @@ -167,15 +156,6 @@ bool KthvalueOpInferSymbolicShape( bool LogcumsumexpOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); -bool LogicalOrOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); -bool LogicalOr_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); -bool LogicalXorOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); -bool LogicalXor_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); - bool MaskedSelectOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); bool PoissonOpInferSymbolicShape( @@ -206,34 +186,18 @@ bool Exponential_OpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); bool GaussianOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); -bool GreaterEqualOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); -bool GreaterEqual_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); -bool LessEqualOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); -bool LessEqual_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); bool LinspaceOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); bool LogspaceOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); bool LogsumexpOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); -bool MaximumOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); bool MinOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); -bool MinimumOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); bool PadOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); bool RandintOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); -bool RemainderOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); -bool Remainder_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); bool RepeatInterleaveOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); bool SplitWithNumOpInferSymbolicShape( diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.cc index 31fe14209cc61..68ca785e0fbb0 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.cc @@ -290,6 +290,10 @@ bool Pow_OpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return 
SameOperandsAndResultShape(op, shape_analysis); } +bool PrintOpInferSymbolicShape(pir::Operation *op, + pir::ShapeConstraintIRAnalysis *shape_analysis) { + return SameOperandsAndResultShape(op, shape_analysis); +} bool RealOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return SameOperandsAndResultShape(op, shape_analysis); diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.h index 32941dd0c6f78..c671d9da22818 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.h @@ -149,6 +149,8 @@ bool PowOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); bool Pow_OpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); +bool PrintOpInferSymbolicShape(pir::Operation *op, + pir::ShapeConstraintIRAnalysis *shape_analysis); bool RealOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); bool ReluOpInferSymbolicShape(pir::Operation *op, diff --git a/paddle/fluid/pir/dialect/operator/ir/ops.yaml b/paddle/fluid/pir/dialect/operator/ir/ops.yaml index 22bae4a65ab9a..7e05e5b79de8d 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ops.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/ops.yaml @@ -1070,6 +1070,7 @@ kernel : func : print_kernel param: [in, first_n, message, summarize, print_tensor_name, print_tensor_type, print_tensor_shape, print_tensor_layout, print_tensor_lod, print_phase, is_forward] + interfaces : paddle::dialect::InferSymbolicShapeInterface - op : prod args : (Tensor x, IntArray dims, bool keep_dim, bool reduce_all) From 600c058f92bc80bb5d9eff1512734c3b43ee6a93 Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Fri, 1 Mar 2024 10:54:45 +0800 Subject: [PATCH 41/55] [clang-tidy] NO.17 enable cppcoreguidelines-explicit-virtual-functions,modernize-use-override (#61714) * clangtidy 17 * fix --- paddle/fluid/framework/details/graph_test_base.h | 6 +++--- paddle/fluid/framework/ir/graph_test.cc | 4 ++-- paddle/fluid/framework/ir/pass_test.cc | 4 ++-- .../fluid/ir_adaptor/translator/op_translator.cc | 2 +- test/cpp/fluid/framework/op_proto_maker_test.cc | 6 +++--- test/cpp/fluid/framework/operator_test.cc | 16 ++++++++-------- .../fluid/framework/var_type_inference_test.cc | 2 +- test/cpp/pir/core/add_dialect_parser_test.cc | 2 +- 8 files changed, 21 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/framework/details/graph_test_base.h b/paddle/fluid/framework/details/graph_test_base.h index 2f50556e771ee..09d7dcc863aed 100644 --- a/paddle/fluid/framework/details/graph_test_base.h +++ b/paddle/fluid/framework/details/graph_test_base.h @@ -44,7 +44,7 @@ class DummyOp : public OperatorBase { class SumOpMaker : public OpProtoAndCheckerMaker { public: - void Make() { + void Make() override { AddInput("X", "").AsDuplicable(); AddOutput("Out", ""); AddComment(""); @@ -53,7 +53,7 @@ class SumOpMaker : public OpProtoAndCheckerMaker { class AssignOpMaker : public OpProtoAndCheckerMaker { public: - void Make() { + void Make() override { AddInput("X", "").AsDuplicable(); AddOutput("Out", ""); AddComment(""); @@ -62,7 +62,7 @@ class AssignOpMaker : public OpProtoAndCheckerMaker { class SplitOpMaker : public OpProtoAndCheckerMaker { public: - void Make() { + void Make() 
override {
     AddInput("X", "");
     AddOutput("Out", "").AsDuplicable();
     AddComment("");
diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc
index b8ad98113a3a4..4654abe6eb48d 100644
--- a/paddle/fluid/framework/ir/graph_test.cc
+++ b/paddle/fluid/framework/ir/graph_test.cc
@@ -38,7 +38,7 @@ class NOP : public OperatorBase {

 class SumOpMaker : public OpProtoAndCheckerMaker {
  public:
-  void Make() {
+  void Make() override {
     AddInput("X", "").AsDuplicable();
     AddOutput("Out", "").AsDuplicable();
     AddComment("");
@@ -60,7 +60,7 @@ class SumOpVarTypeInference : public VarTypeInference {

 class DummyOpMaker : public OpProtoAndCheckerMaker {
  public:
-  void Make() {
+  void Make() override {
     AddInput("X", "").AsDuplicable();
     AddOutput("Out", "").AsDuplicable();
     AddComment("");
diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc
index 2d13a912d6cca..4c3d19f51e73f 100644
--- a/paddle/fluid/framework/ir/pass_test.cc
+++ b/paddle/fluid/framework/ir/pass_test.cc
@@ -43,7 +43,7 @@ void BuildCircleGraph(Graph* g) {

 class TestPass : public Pass {
  protected:
-  void ApplyImpl(ir::Graph* graph) const {
+  void ApplyImpl(ir::Graph* graph) const override {
     graph->Set("copy_test_pass_attr", new int);
     graph->Set("copy_test_graph_attr", new int);

@@ -226,7 +226,7 @@ TEST(PassTest, TestPassAttrCheckConvertAllBlocks) {

 class TestPassWithDefault : public Pass {
  protected:
-  void ApplyImpl(ir::Graph* graph) const {
+  void ApplyImpl(ir::Graph* graph) const override {
     graph->Set("copy_default_attr", new int);

     int test_pass_attr = this->Get<int>("default_attr");
diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc
index c4ad629fc3d91..b7081609f2f90 100644
--- a/paddle/fluid/ir_adaptor/translator/op_translator.cc
+++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc
@@ -2722,7 +2722,7 @@ struct RandIntOpTranscriber : public OpTranscriber {
   std::tuple<OpOutputTypeList, OpOutputMapping> GenerateOperationOutput(
       pir::IrContext* ctx,
       const OpDesc& op_desc,
-      const OpOutputInfoList& output_infos) {
+      const OpOutputInfoList& output_infos) override {
     OpOutputMapping arg_to_idx;
     OpOutputTypeList op_output_types = {};
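The `override` keywords added throughout this patch are not cosmetic: without them, a signature drift in the base class silently turns an intended override into a brand-new virtual function. A self-contained illustration (not Paddle code) of what modernize-use-override buys:

struct Maker {
  virtual void Make() {}
  virtual ~Maker() = default;
};

struct GoodMaker : Maker {
  void Make() override {}  // the compiler verifies this really overrides
};

// struct BadMaker : Maker {
//   void Make() const override {}  // error: marked 'override' but does not
//                                  // override -- the bug is caught at once
// };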
diff --git a/test/cpp/fluid/framework/op_proto_maker_test.cc b/test/cpp/fluid/framework/op_proto_maker_test.cc
index bc25e34d8139a..7c2301cded0ce 100644
--- a/test/cpp/fluid/framework/op_proto_maker_test.cc
+++ b/test/cpp/fluid/framework/op_proto_maker_test.cc
@@ -21,7 +21,7 @@ limitations under the License. */

 class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
  public:
-  void Make() {
+  void Make() override {
     AddAttr<float>("scale", "scale of test op");
     AddAttr<float>("scale", "scale of test op");
   }
@@ -37,7 +37,7 @@ TEST(ProtoMaker, DuplicatedAttr) {

 class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
  public:
-  void Make() {
+  void Make() override {
     AddInput("input", "input of test op");
     AddInput("input", "input of test op");
   }
@@ -54,7 +54,7 @@ TEST(ProtoMaker, DuplicatedInOut) {
 class OpProtoMakerWithScalar
     : public paddle::framework::OpProtoAndCheckerMaker {
  public:
-  void Make() {
+  void Make() override {
     AddAttr<paddle::experimental::Scalar>("generic_scalar",
                                           "generic_scalar of test op");
     AddAttr<std::vector<paddle::experimental::Scalar>>(
diff --git a/test/cpp/fluid/framework/operator_test.cc b/test/cpp/fluid/framework/operator_test.cc
index d40a45ae5172a..b83127a239dbf 100644
--- a/test/cpp/fluid/framework/operator_test.cc
+++ b/test/cpp/fluid/framework/operator_test.cc
@@ -51,7 +51,7 @@ class OpWithoutKernelTest : public OperatorBase {

 class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker {
  public:
-  void Make() {
+  void Make() override {
     AddInput("input", "input of test op");
     AddOutput("output", "output of test op");
     AddAttr<float>("scale", "scale of cosine op");
@@ -106,7 +106,7 @@ static int special_type_value = 1;

 class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
  public:
-  void Make() {
+  void Make() override {
     AddInput("x", "input of test op");
     AddOutput("y", "output of test op");
     AddAttr<float>("scale", "scale of cosine op")
@@ -161,7 +161,7 @@ class CPUKernel2Test : public OpKernel<float> {
 class OpKernelTestMultiInputsProtoAndCheckerMaker
     : public OpProtoAndCheckerMaker {
  public:
-  void Make() {
+  void Make() override {
     AddInput("xs", "inputs of test op").AsDuplicable();
     AddInput("k", "input of test op");
     AddOutput("ys", "outputs of test op").AsDuplicable();
@@ -335,7 +335,7 @@ class IndicateLoDTensorDataTypeTest : public OperatorWithKernel {

 class IndicateLoDTensorDataTypeTestProtoMaker : public OpProtoAndCheckerMaker {
  public:
-  void Make() {
+  void Make() override {
     AddInput("phi::DenseTensor", "Input of phi::DenseTensor type Variable.");
     AddComment("This Op is only for IndicateVarDataType interface test.");
   }
@@ -357,7 +357,7 @@ class IndicateSelectedRowsDataTypeTest : public OperatorWithKernel {
 class IndicateSelectedRowsDataTypeTestProtoMaker
     : public OpProtoAndCheckerMaker {
  public:
-  void Make() {
+  void Make() override {
     AddInput("SelectedRows", "Input of SelectedRows type Variable.");
     AddComment("This Op is only for IndicateVarDataType interface test.");
   }
@@ -377,7 +377,7 @@ class IndicateOtherDataTypeTest : public OperatorWithKernel {
 };
 class IndicateOtherDataTypeTestProtoMaker : public OpProtoAndCheckerMaker {
  public:
-  void Make() {
+  void Make() override {
     AddInput("Other", "Input of Other type Variable");
     AddComment("This Op is only for IndicateVarDataType interface test.");
   }
@@ -512,7 +512,7 @@ class SetLoDLevelTest : public OperatorWithKernel {

 class GetSetLoDLevelTestMaker : public OpProtoAndCheckerMaker {
  public:
-  void Make() {
+  void Make() override {
     AddInput("X", "(phi::DenseTensor) Input Variable.");
     AddOutput("Out", "(phi::DenseTensor) Output Variable.");
     AddComment("This Op is only for Get/SetLoDLevel interface test.");
@@ -592,7 +592,7 @@ class OpUnusedVarTest : public OperatorWithKernel {

 class OpUnusedVarTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
  public:
-  void Make() {
+  void Make() override {
     AddInput("X", "input of test op");
     AddOutput("Y",
"output of test op"); AddComment("This is test op for unused var check."); diff --git a/test/cpp/fluid/framework/var_type_inference_test.cc b/test/cpp/fluid/framework/var_type_inference_test.cc index b7f7f32348ec6..6a310843e95e5 100644 --- a/test/cpp/fluid/framework/var_type_inference_test.cc +++ b/test/cpp/fluid/framework/var_type_inference_test.cc @@ -41,7 +41,7 @@ class NOP : public OperatorBase { class SumOpMaker : public OpProtoAndCheckerMaker { public: - void Make() { + void Make() override { AddInput("X", "").AsDuplicable(); AddOutput("Out", ""); AddComment(""); diff --git a/test/cpp/pir/core/add_dialect_parser_test.cc b/test/cpp/pir/core/add_dialect_parser_test.cc index 5a64b28a5cbd6..1b6ae533ffa16 100644 --- a/test/cpp/pir/core/add_dialect_parser_test.cc +++ b/test/cpp/pir/core/add_dialect_parser_test.cc @@ -37,7 +37,7 @@ class TestParserDialect : public pir::Dialect { static const char* name() { return "tp"; } - void PrintAttribute(pir::Attribute attr, std::ostream& os) const; + void PrintAttribute(pir::Attribute attr, std::ostream& os) const; // NOLINT pir::Attribute ParseAttribute(pir::IrParser& parser); // NOLINT From 1ea6a51857fc9b3d47ab17a6eb47827c056f072d Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Fri, 1 Mar 2024 10:56:10 +0800 Subject: [PATCH 42/55] [clang-tidy] NO.3 bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions PART 2 (#62109) --- .../collective/process_group_nccl.cc | 4 +++- .../distributed/test/ctr_accessor_test.cc | 8 +++---- .../fluid/framework/downpour_lite_worker.cc | 3 ++- paddle/fluid/framework/downpour_worker.cc | 5 ++-- paddle/fluid/framework/fleet/gloo_wrapper.cc | 4 ++-- paddle/fluid/framework/fleet/metrics.cc | 2 +- .../ir/mkldnn/cpu_bfloat16_pass_tester.cc | 4 ++-- .../framework/ir/mkldnn/cpu_quantize_pass.cc | 16 +++++++------ ...ant_transpose2_dequant_onednn_fuse_pass.cc | 2 +- .../ir/trt_skip_layernorm_fuse_pass.cc | 3 ++- .../analysis/ir_passes/lite_subgraph_pass.cc | 2 +- paddle/fluid/inference/api/analysis_config.cc | 8 ++++--- .../allocation/cuda_managed_allocator.cc | 2 +- .../memory/allocation/system_allocator.cc | 3 ++- .../fluid/operators/fused/resnet_unit_op.cc | 2 +- .../operators/mkldnn/reshape_mkldnn_op.cc | 6 ++--- .../operator/utils/op_yaml_info_parser.cc | 2 +- paddle/fluid/platform/gen_comm_id_helper.cc | 4 ++-- paddle/fluid/platform/profiler/utils.cc | 11 +++++---- paddle/fluid/pybind/eager_utils.cc | 6 ++--- paddle/fluid/pybind/imperative.cc | 5 ++-- paddle/phi/api/profiler/device_tracer.cc | 8 +++---- paddle/phi/api/profiler/profiler.cc | 2 +- paddle/phi/backends/device_base.cc | 6 ++--- paddle/phi/backends/device_code.cc | 3 ++- paddle/phi/backends/gpu/cuda/cuda_info.cc | 2 +- paddle/phi/backends/gpu/gpu_info.cc | 2 +- paddle/phi/infermeta/binary.cc | 8 +++---- paddle/phi/infermeta/multiary.cc | 4 ++-- .../phi/infermeta/spmd_rules/elementwise.cc | 24 +++++++++---------- paddle/phi/infermeta/spmd_rules/reduction.cc | 8 +++---- paddle/phi/infermeta/spmd_rules/replicated.cc | 10 ++++---- paddle/phi/infermeta/spmd_rules/softmax.cc | 6 ++--- paddle/phi/infermeta/spmd_rules/unsqueeze.cc | 8 +++---- paddle/phi/infermeta/spmd_rules/utils.cc | 7 +++--- paddle/phi/kernels/funcs/jit/gen/blas.cc | 2 +- paddle/phi/kernels/funcs/jit/gen/gru.cc | 2 +- paddle/phi/kernels/funcs/jit/gen/lstm.cc | 2 +- .../fusion/onednn/fused_transpose_kernel.cc | 6 ++--- .../phi/kernels/onednn/concat_grad_kernel.cc | 4 ++-- .../phi/kernels/onednn/expand_grad_kernel.cc | 2 +- .../phi/kernels/onednn/matmul_grad_kernel.cc | 6 +++-- 
 paddle/phi/kernels/onednn/matmul_kernel.cc    |  4 ++--
 .../phi/kernels/onednn/slice_grad_kernel.cc   |  2 +-
 paddle/phi/kernels/onednn/slice_kernel.cc     |  2 +-
 .../phi/kernels/onednn/squeeze_grad_kernel.cc |  2 +-
 .../cpp/fluid/fused/cudnn_bn_add_relu_test.cc |  2 +-
 test/cpp/fluid/memory/buddy_allocator_test.cc |  8 +++----
 test/cpp/imperative/test_group.cc             |  4 ++--
 test/cpp/inference/api/analyzer_dam_tester.cc |  2 +-
 .../analyzer_int8_object_detection_tester.cc  |  2 +-
 .../analyzer_lexical_analysis_gru_tester.cc   |  2 +-
 .../cpp/phi/kernels/test_fused_adam_kernel.cc |  2 +-
 test/cpp/phi/kernels/test_memcpy_dev_api.cc   |  2 +-
 54 files changed, 138 insertions(+), 120 deletions(-)

diff --git a/paddle/fluid/distributed/collective/process_group_nccl.cc b/paddle/fluid/distributed/collective/process_group_nccl.cc
index 82e95204590bd..f38fe1207c199 100644
--- a/paddle/fluid/distributed/collective/process_group_nccl.cc
+++ b/paddle/fluid/distributed/collective/process_group_nccl.cc
@@ -528,7 +528,9 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Gather(
     size_t offset = 0;
     size_t numel = out_tensor->numel() / size_;
     for (auto i = 0; i < size_; i++) {
-      partial_tensors.push_back(GetPartialTensor(*out_tensor, offset, numel));
+      partial_tensors.push_back(GetPartialTensor(*out_tensor,
+                                                 static_cast<int64_t>(offset),
+                                                 static_cast<int64_t>(numel)));
       offset += numel;
     }
   }
diff --git a/paddle/fluid/distributed/test/ctr_accessor_test.cc b/paddle/fluid/distributed/test/ctr_accessor_test.cc
index 9b71e4524625c..0288a93d71a96 100644
--- a/paddle/fluid/distributed/test/ctr_accessor_test.cc
+++ b/paddle/fluid/distributed/test/ctr_accessor_test.cc
@@ -79,7 +79,7 @@ TEST(downpour_feature_value_accessor_test, test_shrink) {

   float* value = new float[acc->GetAccessorInfo().dim];
   for (auto i = 0u; i < acc->GetAccessorInfo().dim; ++i) {
-    value[i] = i * 1.0;
+    value[i] = static_cast<float>(i) * 1.0;
   }
   ASSERT_TRUE(!acc->Shrink(value));

@@ -98,7 +98,7 @@ TEST(downpour_feature_value_accessor_test, test_save) {

   float* value = new float[acc->GetAccessorInfo().dim];
   for (auto i = 0u; i < acc->GetAccessorInfo().dim; ++i) {
-    value[i] = i * 1.0;
+    value[i] = static_cast<float>(i) * 1.0;
   }

   // save all feature
@@ -166,7 +166,7 @@ TEST(downpour_feature_value_accessor_test, test_update) {
   for (auto i = 0u; i < item_size; ++i) {
     float* p = new float[acc->GetAccessorInfo().update_dim];
     for (auto j = 0u; j < acc->GetAccessorInfo().update_dim; ++j) {
-      p[j] = i + 1;
+      p[j] = static_cast<float>(i) + 1.0;
     }
     grad[i] = p;
   }
@@ -288,7 +288,7 @@ TEST(downpour_feature_value_accessor_test, test_string_related) {
   const int field_size = 15;
   float* value = new float[field_size];
   for (auto i = 0u; i < field_size; ++i) {
-    value[i] = i;
+    value[i] = static_cast<float>(i);
   }
   auto str = acc->ParseToString(value, 0);
diff --git a/paddle/fluid/framework/downpour_lite_worker.cc b/paddle/fluid/framework/downpour_lite_worker.cc
index 3d453c018c1d5..e86856bf1b2ff 100644
--- a/paddle/fluid/framework/downpour_lite_worker.cc
+++ b/paddle/fluid/framework/downpour_lite_worker.cc
@@ -410,7 +410,8 @@ void DownpourLiteWorker::TrainFilesWithProfiler() {
               fprintf(stderr,
                       "push dense time percent: %f\n",
                       push_dense_time / total_time * 100);
-      fprintf(stderr, "%6.2f instances/s\n", total_inst / total_time);
+      fprintf(
+          stderr, "%6.2f instances/s\n", total_inst / total_time);  // NOLINT
     }
   }
   timeline.Start();
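The casts added above share one pattern: an unsigned loop index assigned into a float slot must be converted explicitly so that bugprone-narrowing-conversions can tell deliberate narrowing from accidental narrowing. The pattern in isolation (illustrative sketch, not Paddle code):

#include <cstddef>

void FillRamp(float* value, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    // Explicit cast: the narrowing from size_t to float is intentional.
    value[i] = static_cast<float>(i) * 1.0f;
  }
}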
diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc
index 6ce2967a08f1f..0d5bd66297c53 100644
--- a/paddle/fluid/framework/downpour_worker.cc
+++ b/paddle/fluid/framework/downpour_worker.cc
@@ -334,8 +334,9 @@ void DownpourWorker::AdjustInsWeight() {
       }
       float ins_weight = 1.0;
       if (nid_show >= 0 && nid_show < nid_adjw_threshold) {
-        ins_weight = log(M_E + (nid_adjw_threshold - nid_show) /
-                                   nid_adjw_threshold * nid_adjw_ratio);
+        ins_weight = static_cast<float>(
+            log(M_E + (nid_adjw_threshold - nid_show) / nid_adjw_threshold *
+                          nid_adjw_ratio));
         // count nid adjw insnum and weight
         ++nid_adjw_num;
         nid_adjw_weight += ins_weight;
diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.cc b/paddle/fluid/framework/fleet/gloo_wrapper.cc
index 277004b6dc164..421953ff8c02a 100644
--- a/paddle/fluid/framework/fleet/gloo_wrapper.cc
+++ b/paddle/fluid/framework/fleet/gloo_wrapper.cc
@@ -165,7 +165,7 @@ void HdfsStore::wait(const std::vector<std::string>& keys,
   int32_t last_check_rank = -1;
   for (size_t i = 0; i < check_key_status.size(); ++i) {
     if (!check_key_status[i]) {
-      last_check_rank = i;
+      last_check_rank = static_cast<int32_t>(i);
       break;
     }
   }
@@ -252,7 +252,7 @@ void ParallelConnectContext::connectFullMesh(
     connect_threads[i].reset(new std::thread(
         [&store, &transportContext, total_add_size, this](
            size_t thread_idx, size_t thread_num) -> void {
-          for (int i = thread_idx; i < size; i += thread_num) {
+          for (int i = thread_idx; i < size; i += thread_num) {  // NOLINT
            if (i == rank) {
              continue;
            }
diff --git a/paddle/fluid/framework/fleet/metrics.cc b/paddle/fluid/framework/fleet/metrics.cc
index 58e1e195fbab7..5801860f66566 100644
--- a/paddle/fluid/framework/fleet/metrics.cc
+++ b/paddle/fluid/framework/fleet/metrics.cc
@@ -301,7 +301,7 @@ void BasicAucCalculator::add_uid_unlock_data(double pred,
   WuaucRecord record;
   record.uid_ = uid;
   record.label_ = label;
-  record.pred_ = pred;
+  record.pred_ = static_cast<float>(pred);
   wuauc_records_.emplace_back(std::move(record));
 }

diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc
index dfd838895aeb4..951d064364ce3 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc
@@ -73,9 +73,9 @@ void MainTest(const ProgramDesc& prog,
   auto graph = std::make_unique<ir::Graph>(prog);
   auto pass = PassRegistry::Instance().Get("cpu_bfloat16_pass");

-  int original_nodes_num = graph->Nodes().size();
+  int original_nodes_num = static_cast<int>(graph->Nodes().size());
   graph.reset(pass->Apply(graph.release()));
-  int current_nodes_num = graph->Nodes().size();
+  int current_nodes_num = static_cast<int>(graph->Nodes().size());

   int quantize_nodes_count = 0;
   int dequantize_nodes_count = 0;
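The scale computations fixed in the quantize pass below reduce to one formula: a tensor whose values span [0, 1/scale_to_one] is mapped onto the full integer range, U8_MAX (255) for unsigned data and S8_MAX (127) for signed. As a standalone sketch (hypothetical helper, mirroring the fixed code):

float QuantScale(double scale_to_one, bool is_unsigned) {
  const unsigned max = is_unsigned ? 255u : 127u;  // U8_MAX : S8_MAX
  return static_cast<float>(scale_to_one) * max;   // explicit narrowing
}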
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
index 2f1e7e8a53865..0e9c452455de3 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@@ -94,8 +94,8 @@ void CPUQuantizePass::QuantizeInput(Graph* g,
                         "Var(%s) isn't the input of the %s operator.",
                         input_name,
                         op->Op()->Type()));
-  unsigned max = is_input_unsigned ? U8_MAX : S8_MAX;
-  float scale = scale_to_one * max;
+  unsigned max = is_input_unsigned ? U8_MAX : S8_MAX;  // NOLINT
+  float scale = static_cast<float>(scale_to_one) * max;

   // Create quantize output variable
   VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out"));
@@ -175,12 +175,13 @@ void CPUQuantizePass::QuantizeInputs(Graph* g,
   double scale_out = GetScaleValueForNode(output);
   unsigned max = are_inputs_unsigned ? U8_MAX : S8_MAX;
-  float scale = scale_out * max;
+  float scale = static_cast<float>(scale_out) * max;

   for (size_t var_id = 0; var_id < unique_var_names.size(); var_id++) {
     auto index = -1;
     for (size_t it = 0; it < inputs.size(); it++) {
-      if (inputs[it]->Name() == unique_var_names[var_id]) index = it;
+      if (inputs[it]->Name() == unique_var_names[var_id])
+        index = static_cast<int>(it);
     }

     if (index == -1) {
@@ -249,7 +250,7 @@ void CPUQuantizePass::DequantizeOutput(Graph* g,
                         output_name,
                         op->Op()->Type()));
   unsigned max = is_unsigned ? U8_MAX : S8_MAX;
-  float scale = scale_to_one * max;
+  float scale = static_cast<float>(scale_to_one) * max;

   // Create dequantize input variable
   VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in"));
@@ -298,12 +299,13 @@ void CPUQuantizePass::DequantizeOutputs(Graph* g,
   std::vector<Node*> dequantize_in_nodes(outputs.size());

   unsigned max = is_unsigned ? U8_MAX : S8_MAX;
-  float scale = scale_to_one * max;
+  float scale = static_cast<float>(scale_to_one) * max;

   for (size_t var_id = 0; var_id < var_names.size(); var_id++) {
     auto index = -1;
     for (size_t it = 0; it < outputs.size(); it++) {
-      if (outputs[it]->Name() == var_names[var_id]) index = it;
+      if (outputs[it]->Name() == var_names[var_id])
+        index = static_cast<int>(it);
     }

     if (index == -1) {
diff --git a/paddle/fluid/framework/ir/mkldnn/quant_transpose2_dequant_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/quant_transpose2_dequant_onednn_fuse_pass.cc
index 09bebfaec99c3..b331cc996fffc 100644
--- a/paddle/fluid/framework/ir/mkldnn/quant_transpose2_dequant_onednn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/quant_transpose2_dequant_onednn_fuse_pass.cc
@@ -137,7 +137,7 @@ void FuseQuantTranspose2DequantOneDNNPass::FuseTranspose2Dequantize(
         dequant_op->Op()->HasAttr("Scale")
             ? PADDLE_GET_CONST(float, dequant_op->Op()->GetAttr("Scale"))
             : 1;
-    float reorder_scale = 1.0 / scale;
+    float reorder_scale = static_cast<float>(1.0) / scale;
     float shift =
         dequant_op->Op()->HasAttr("Shift")
             ? PADDLE_GET_CONST(float, dequant_op->Op()->GetAttr("Shift"))
diff --git a/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc
index 81f96f2fc33f4..0708218dbd07c 100644
--- a/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc
@@ -218,7 +218,8 @@ void TrtSkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
       }
       new_desc.SetAttr("begin_norm_axis", begin_norm_axis);
     }
-    int32_t hidden_size = layer_norm_scale->Var()->GetShape()[0];
+    int32_t hidden_size =
+        static_cast<int32_t>(layer_norm_scale->Var()->GetShape()[0]);
     new_desc.SetAttr("hidden_size", hidden_size);

     auto fused_node = graph->CreateOpNode(&new_desc);  // OpDesc will be copied.
diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
index 2d484a943cf20..f8a4d4d15af72 100644
--- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
@@ -71,7 +71,7 @@ std::vector<std::string> IOVarsFilter(const std::vector<Node*>& nodes) {

 void StrToBinaryFile(const std::string& path, const std::string& str) {
   std::ofstream file(path.c_str(), std::ios::binary);
-  file.write(str.c_str(), str.size());
+  file.write(str.c_str(), str.size());  // NOLINT
   file.close();
 }

diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 0ec5151a92bc5..5987483220b8a 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -1232,11 +1232,13 @@ float AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
   size_t gpu_total, gpu_available;
   platform::SetDeviceId(gpu_device_id_);
   platform::GpuMemoryUsage(&gpu_available, &gpu_total);
-  double total_gpu_memory = gpu_total / 1024. / 1024.;
+  double total_gpu_memory = static_cast<double>(gpu_total) / 1024. / 1024.;
   float fraction_of_gpu_memory =
-      static_cast<float>(memory_pool_init_size_mb()) / total_gpu_memory;
+      static_cast<float>(memory_pool_init_size_mb()) /
+      static_cast<float>(total_gpu_memory);
   VLOG(3) << "total_gpu_memory is " << total_gpu_memory
-          << "M, gpu_available is " << gpu_available / 1024. / 1024.
+          << "M, gpu_available is "
+          << static_cast<double>(gpu_available) / 1024. / 1024.
           << "M, memory_pool_init_size is " << memory_pool_init_size_mb()
           << "M.";
   return fraction_of_gpu_memory;
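The fraction computed above relates the configured pool size (in MB) to the device's total memory (reported in bytes); keeping the byte-to-MB divisions in double and casting only at the end avoids a chain of silent narrowings. A standalone sketch of the same arithmetic (hypothetical helper):

#include <cstddef>

float FractionForPool(std::size_t pool_mb, std::size_t total_bytes) {
  const double total_mb = static_cast<double>(total_bytes) / 1024. / 1024.;
  return static_cast<float>(pool_mb) / static_cast<float>(total_mb);
}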
- << ", available " << usable / 1024.0 / 1024.0 << " MB"; + << ", available " << usable / 1024.0 / 1024.0 + << " MB"; // NOLINT return nullptr; } diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cc b/paddle/fluid/operators/fused/resnet_unit_op.cc index f1f2628119c15..5827cd3427dee 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op.cc +++ b/paddle/fluid/operators/fused/resnet_unit_op.cc @@ -27,7 +27,7 @@ static framework::DDim GetBitmaskDims(std::vector out_shape) { std::multiplies()) / // NOLINT c; int32_t c_int32_elems = ((c + 63) & ~63) / 32; - int32_t nhw_int32_elems = ((nhw + 31) & ~31); + int32_t nhw_int32_elems = static_cast(((nhw + 31) & ~31)); std::vector bitmask_shape = {nhw_int32_elems, c_int32_elems, 1}; return common::make_ddim(bitmask_shape); } diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc index 1e3b29da11e5b..8632160b04ae0 100644 --- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc @@ -185,7 +185,7 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { "be -1. But received shape = [%s], shape[%d] is also -1.", common::make_ddim(shape), i)); - unk_dim_idx = i; + unk_dim_idx = static_cast(i); } else if (shape[i] == copy_dim_val) { PADDLE_ENFORCE_LT( static_cast(i), @@ -212,9 +212,9 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { shape[i])); } - capacity *= (shape[i] ? shape[i] : in_dims[i]); + capacity *= (shape[i] ? shape[i] : in_dims[i]); // NOLINT output_shape[i] = - (shape[i] ? static_cast(shape[i]) : in_dims[i]); + (shape[i] ? static_cast(shape[i]) : in_dims[i]); // NOLINT } if (unk_dim_idx != -1) { diff --git a/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.cc b/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.cc index 7f84eac85bdb8..41140053a22f0 100644 --- a/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.cc +++ b/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.cc @@ -232,7 +232,7 @@ int OpYamlInfoParser::GetTensorParamIndexByArgsName( kernel_fn_tensor_params_.end(), args_name); if (iter != kernel_fn_tensor_params_.end()) { - return std::distance(kernel_fn_tensor_params_.begin(), iter); + return std::distance(kernel_fn_tensor_params_.begin(), iter); // NOLINT } else { return -1; } diff --git a/paddle/fluid/platform/gen_comm_id_helper.cc b/paddle/fluid/platform/gen_comm_id_helper.cc index 40d80f8ef2cbc..ab10f799f68d1 100644 --- a/paddle/fluid/platform/gen_comm_id_helper.cc +++ b/paddle/fluid/platform/gen_comm_id_helper.cc @@ -82,7 +82,7 @@ static int SocketSend(int fd, const char* buffer, int size) { int offset = 0; int bytes = 0; while (offset < size) { - bytes = send(fd, buffer + offset, size - offset, 0); + bytes = send(fd, buffer + offset, size - offset, 0); // NOLINT if (bytes == -1) { if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { // send failed @@ -100,7 +100,7 @@ static int SocketRecv(int fd, char* buffer, int size) { int offset = 0; int bytes = 0; while (offset < size) { - bytes = recv(fd, buffer + offset, size - offset, 0); + bytes = recv(fd, buffer + offset, size - offset, 0); // NOLINT if (bytes == 0) { // closed by client, maybe probing alive client return 0; diff --git a/paddle/fluid/platform/profiler/utils.cc b/paddle/fluid/platform/profiler/utils.cc index 8c12f84416579..236c77cec5b22 100644 --- a/paddle/fluid/platform/profiler/utils.cc +++ b/paddle/fluid/platform/profiler/utils.cc @@ -106,7 +106,8 @@ float 
   float occupancy = 0.0;
   std::vector<int> device_ids = GetSelectedDevices();
   if (DeviceId < device_ids.size()) {
-    const gpuDeviceProp& device_property = GetDeviceProperties(DeviceId);
+    const gpuDeviceProp& device_property =
+        GetDeviceProperties(static_cast<int>(DeviceId));
     cudaOccFuncAttributes occFuncAttr;
     occFuncAttr.maxThreadsPerBlock = INT_MAX;
     occFuncAttr.numRegs = RegistersPerThread;
@@ -127,11 +128,13 @@ float CalculateEstOccupancy(uint32_t DeviceId,
                               blockSize,
                               dynamicSmemSize);
     if (status == CUDA_OCC_SUCCESS) {
-      if (occ_result.activeBlocksPerMultiprocessor < BlocksPerSm) {
-        BlocksPerSm = occ_result.activeBlocksPerMultiprocessor;
+      if (static_cast<float>(occ_result.activeBlocksPerMultiprocessor) <
+          BlocksPerSm) {
+        BlocksPerSm =
+            static_cast<float>(occ_result.activeBlocksPerMultiprocessor);
       }
       occupancy =
-          BlocksPerSm * blockSize /
+          BlocksPerSm * static_cast<float>(blockSize) /
           static_cast<float>(device_property.maxThreadsPerMultiProcessor);
     } else {
       LOG(WARNING) << "Failed to calculate estimated occupancy, status = "
diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc
index d613c008b4958..c6a2db061594b 100644
--- a/paddle/fluid/pybind/eager_utils.cc
+++ b/paddle/fluid/pybind/eager_utils.cc
@@ -518,7 +518,7 @@ std::vector<int64_t> CastPyArg2VectorOfInt64(PyObject* obj, size_t arg_pos) {
   } else if (obj == Py_None) {
     return {};
   } else if (PyObject_CheckLongOrConvertToLong(&obj)) {
-    return {static_cast<int64_t>(PyLong_AsLong(obj))};
+    return {static_cast<int64_t>(PyLong_AsLong(obj))};  // NOLINT
   } else {
     PADDLE_THROW(platform::errors::InvalidType(
         "argument (position %d) must be "
@@ -566,7 +566,7 @@ std::vector<size_t> CastPyArg2VectorOfSize_t(PyObject* obj, size_t arg_pos) {
   } else if (obj == Py_None) {
     return {};
   } else if (PyObject_CheckLongOrConvertToLong(&obj)) {
-    return {PyLong_AsSize_t(obj)};
+    return {PyLong_AsSize_t(obj)};  // NOLINT
   } else {
     PADDLE_THROW(platform::errors::InvalidType(
         "argument (position %d) must be "
@@ -614,7 +614,7 @@ std::vector<float> CastPyArg2VectorOfFloat(PyObject* obj, size_t arg_pos) {
   } else if (obj == Py_None) {
     return {};
   } else if (PyObject_CheckFloatOrConvertToFloat(&obj)) {
-    return {static_cast<float>(PyFloat_AsDouble(obj))};
+    return {static_cast<float>(PyFloat_AsDouble(obj))};  // NOLINT
   } else {
     PADDLE_THROW(platform::errors::InvalidType(
        "argument (position %d) must be "
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index c540fe0687d88..288a05d638b73 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -1357,8 +1357,9 @@ void BindImperative(py::module *m_ptr) {
             auto *index_data = index_tensor.data<int64_t>();
             auto *buffer_data =
                 buffer_tensor->mutable_data<float>(buffer_tensor->place());
-            const int &slice_size = src_tensor.numel() / src_tensor.dims()[0];
-            const int &copy_bytes = slice_size * sizeof(float);
+            const int &slice_size =
+                static_cast<int>(src_tensor.numel()) / src_tensor.dims()[0];
+            const int &copy_bytes = static_cast<int>(slice_size) * sizeof(float);
             int64_t c = 0;
             for (int64_t i = 0; i < index_tensor.numel(); i++) {
               std::memcpy(buffer_data + c * slice_size,
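The pybind gather above boils down to a row-wise memcpy: each selected row is slice_size floats, so the byte count per row is fixed and the source offset is index[i] * slice_size. A self-contained sketch with hypothetical buffers (the explicit casts mirror the clang-tidy fixes):

#include <cstdint>
#include <cstring>

void GatherRows(const float* src, const std::int64_t* index,
                std::int64_t num_index, int slice_size, float* dst) {
  const int copy_bytes = slice_size * static_cast<int>(sizeof(float));
  for (std::int64_t i = 0; i < num_index; ++i) {
    std::memcpy(dst + i * slice_size, src + index[i] * slice_size, copy_bytes);
  }
}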
diff --git a/paddle/phi/api/profiler/device_tracer.cc b/paddle/phi/api/profiler/device_tracer.cc
index f15d6bbb88457..748eedff4ee6d 100644
--- a/paddle/phi/api/profiler/device_tracer.cc
+++ b/paddle/phi/api/profiler/device_tracer.cc
@@ -571,10 +571,10 @@ class DeviceTracerImpl : public DeviceTracer {
         Event *e = c->second;
         Event *parent = e->parent();
         while (parent) {
-          parent->AddCudaElapsedTime(r.start_ns, r.end_ns);
+          parent->AddCudaElapsedTime(r.start_ns, r.end_ns);  // NOLINT
           parent = parent->parent();
         }
-        e->AddCudaElapsedTime(r.start_ns, r.end_ns);
+        e->AddCudaElapsedTime(r.start_ns, r.end_ns);  // NOLINT
       }
     }
     for (const auto &r : mem_records_) {
@@ -583,10 +583,10 @@
         Event *e = c->second;
         Event *parent = e->parent();
         while (parent) {
-          parent->AddCudaElapsedTime(r.start_ns, r.end_ns);
+          parent->AddCudaElapsedTime(r.start_ns, r.end_ns);  // NOLINT
           parent = parent->parent();
         }
-        e->AddCudaElapsedTime(r.start_ns, r.end_ns);
+        e->AddCudaElapsedTime(r.start_ns, r.end_ns);  // NOLINT
       }
     }
 #endif
diff --git a/paddle/phi/api/profiler/profiler.cc b/paddle/phi/api/profiler/profiler.cc
index 6dc419658d3c2..e9c49741a5e6b 100644
--- a/paddle/phi/api/profiler/profiler.cc
+++ b/paddle/phi/api/profiler/profiler.cc
@@ -77,7 +77,7 @@ double Event::CpuElapsedMs(const Event &e) const {

 double Event::CudaElapsedMs(const Event &e) const {
 #ifdef PADDLE_WITH_CUPTI
-  return gpu_ns_ / 1000000.0;
+  return static_cast<double>(gpu_ns_) / 1000000.0;
 #else
   LOG_FIRST_N(WARNING, 1) << "CUDA CUPTI is not enabled";
   return 0;
diff --git a/paddle/phi/backends/device_base.cc b/paddle/phi/backends/device_base.cc
index f27919bef05fe..7860d322f1faa 100644
--- a/paddle/phi/backends/device_base.cc
+++ b/paddle/phi/backends/device_base.cc
@@ -215,9 +215,9 @@ size_t DeviceInterface::AllocSize(size_t dev_id, bool realloc) {
   size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb
                            : FLAGS_initial_gpu_memory_in_mb;
   size_t alloc_bytes =
-      (flag_mb > 0ul
-           ? flag_mb << 20
-           : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use);
+      (flag_mb > 0ul ? flag_mb << 20
+                     : available_to_alloc *
+                           FLAGS_fraction_of_gpu_memory_to_use);  // NOLINT
   PADDLE_ENFORCE_GE(available_to_alloc,
                     alloc_bytes,
                     phi::errors::ResourceExhausted(
diff --git a/paddle/phi/backends/device_code.cc b/paddle/phi/backends/device_code.cc
index 670e0e3781598..e2016ff78b7c3 100644
--- a/paddle/phi/backends/device_code.cc
+++ b/paddle/phi/backends/device_code.cc
@@ -186,7 +186,8 @@ static std::string FindCUDAIncludePath() {
   }
   for (std::string suffix : {"/lib", "/lib64"}) {
     if (EndWith(FLAGS_cuda_dir, suffix)) {
-      cuda_include_path.erase(cuda_include_path.end() - suffix.length());
+      cuda_include_path.erase(cuda_include_path.end() -
+                              suffix.length());  // NOLINT
       break;
     }
   }
diff --git a/paddle/phi/backends/gpu/cuda/cuda_info.cc b/paddle/phi/backends/gpu/cuda/cuda_info.cc
index 0af1beb782fcf..505fc7f3f6cd6 100644
--- a/paddle/phi/backends/gpu/cuda/cuda_info.cc
+++ b/paddle/phi/backends/gpu/cuda/cuda_info.cc
@@ -28,7 +28,7 @@ namespace gpu {

 int DnnVersion() {
   if (!dynload::HasCUDNN()) return -1;
-  return dynload::cudnnGetVersion();
+  return dynload::cudnnGetVersion();  // NOLINT
 }

 static int GetGPUDeviceCountImpl() {
diff --git a/paddle/phi/backends/gpu/gpu_info.cc b/paddle/phi/backends/gpu/gpu_info.cc
index 96048de5c047c..32546f762c39e 100644
--- a/paddle/phi/backends/gpu/gpu_info.cc
+++ b/paddle/phi/backends/gpu/gpu_info.cc
@@ -66,7 +66,7 @@ size_t GpuAvailableMemToAlloc() {
   size_t available = 0;
   memory_utils::GpuMemoryUsage(&available, &total);
   size_t reserving =
-      static_cast<size_t>(fraction_reserve_gpu_memory * available);
+      static_cast<size_t>(fraction_reserve_gpu_memory * available);  // NOLINT
   // If available size is less than minimum chunk size, no usable memory exists
   size_t available_to_alloc = available - reserving;
   size_t min_chunk_size = GpuMinChunkSize();
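GpuAvailableMemToAlloc's reservation logic, sketched below, keeps a fixed fraction of the currently free device memory in reserve and offers only the remainder to the allocator (hypothetical helper; the cast matches the NOLINT'ed line above):

#include <cstddef>

std::size_t AvailableToAlloc(std::size_t free_bytes, double reserve_frac) {
  const auto reserving =
      static_cast<std::size_t>(reserve_frac * static_cast<double>(free_bytes));
  return free_bytes - reserving;
}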
diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc
index fdef52a5fb6e1..ce47a88c420df 100644
--- a/paddle/phi/infermeta/binary.cc
+++ b/paddle/phi/infermeta/binary.cc
@@ -166,8 +166,8 @@ void ArrayReadInferMeta(const MetaTensor& array,
     out->set_dims({-1});
   } else {
     double index = i.to<double>();
-    out->set_dims(array.dims(index));
-    out->share_lod(array, index);
+    out->set_dims(array.dims(index));  // NOLINT
+    out->share_lod(array, index);      // NOLINT
   }
   out->set_dtype(array.dtype());
   out->set_layout(array.layout());
@@ -3557,8 +3557,8 @@ void WeightDequantizeInferMeta(const MetaTensor& x,
                       dim_scale[0],
                       (x.dims()[1] + (group_size - 1)) / group_size));
   }
-  int n = x.dims()[1];
-  int k = x.dims()[0];
+  int n = static_cast<int>(x.dims()[1]);
+  int k = static_cast<int>(x.dims()[0]);
   out->set_dims(common::make_ddim({n, k}));
   out->set_dtype(out_dtype);
 }
diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
index bb57e5a813aa7..7575cc3cf1434 100644
--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -4706,8 +4706,8 @@ void MaskedMultiheadAttentionInferMeta(const MetaTensor& x,
   int v_num_head = k_num_head;
   int dim_head = static_cast<int>(cache_kv.dims()[4]);
   // below's num_head is q's head actually.
-  int num_head =
-      x.dims()[x.dims().size() - 1] / dim_head - k_num_head - v_num_head;
+  int num_head = x.dims()[x.dims().size() - 1] / dim_head - k_num_head -
+                 v_num_head;  // NOLINT

   PADDLE_ENFORCE_EQ(
       num_head % k_num_head,
diff --git a/paddle/phi/infermeta/spmd_rules/elementwise.cc b/paddle/phi/infermeta/spmd_rules/elementwise.cc
index 3db396de8b613..d558dfa69b7b5 100644
--- a/paddle/phi/infermeta/spmd_rules/elementwise.cc
+++ b/paddle/phi/infermeta/spmd_rules/elementwise.cc
@@ -31,7 +31,7 @@ std::string GetInputBroadcastNotation(const std::vector<int64_t>& shape,
                                       const int max_ndim,
                                       const std::string& alphabet,
                                       std::vector<int>* broadcast_axis_count) {
-  int ndim = shape.size();
+  int ndim = static_cast<int>(shape.size());
   int start_dim = max_ndim - ndim;
   std::string axes_notation = GetBroadcastAxes(ndim, max_ndim, alphabet);

@@ -54,8 +54,8 @@ void GetBinaryNotations(const std::vector<int64_t>& x_shape,
                         std::string* x_axes,
                         std::string* y_axes,
                         std::string* out_axes) {
-  int x_ndim = x_shape.size();
-  int y_ndim = y_shape.size();
+  int x_ndim = static_cast<int>(x_shape.size());
+  int y_ndim = static_cast<int>(y_shape.size());
   int max_ndim = std::max(x_ndim, y_ndim);
   int ninputs = 2;
   std::string alphabet = "abcdefghijklmnopqrstuvwxyz";
@@ -82,7 +82,7 @@ SpmdInfo ElementwiseUnaryInferSpmd(const DistMetaTensor& x) {
   // Step0: Verify Input Args Based on Elementwise Logic
   auto x_shape = common::vectorize(x.dims());
-  int x_ndim = x_shape.size();
+  int x_ndim = static_cast<int>(x_shape.size());
   TensorDistAttr x_dist_attr_src = x.dist_attr();
   std::vector<int64_t> x_dims_mapping = x_dist_attr_src.dims_mapping();
   PADDLE_ENFORCE_EQ(x_ndim,
@@ -129,7 +129,7 @@ SpmdInfo ElementwiseUnaryWithPartialInferSpmd(const DistMetaTensor& x) {
   // Step0: Verify Input Args Based on Elementwise Logic
   auto x_shape = common::vectorize(x.dims());
-  int x_ndim = x_shape.size();
+  int x_ndim = static_cast<int>(x_shape.size());
   TensorDistAttr x_dist_attr_src = x.dist_attr();
   std::vector<int64_t> x_dims_mapping = x_dist_attr_src.dims_mapping();
   PADDLE_ENFORCE_EQ(x_ndim,
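The einsum-style notation these SPMD rules build assigns the trailing ndim letters of a fixed alphabet to a tensor's dims, so a rank-2 input under a max rank of 4 is labelled "cd" while the rank-4 output is "abcd". A sketch of a GetBroadcastAxes-style helper under that assumption (illustrative, not the Paddle implementation):

#include <string>

std::string BroadcastAxes(int ndim, int max_ndim,
                          const std::string& alphabet) {
  // The last ndim letters label the (right-aligned) broadcast dims.
  return alphabet.substr(max_ndim - ndim, ndim);
}
// e.g. BroadcastAxes(2, 4, "abcd") == "cd"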
x_shape.size(); + int x_ndim = static_cast(x_shape.size()); auto out_shape = common::vectorize(out.dims()); - int out_ndim = out_shape.size(); + int out_ndim = static_cast(out_shape.size()); TensorDistAttr out_dist_attr_src = out.dist_attr(); std::vector out_dims_mapping = out_dist_attr_src.dims_mapping(); PADDLE_ENFORCE_EQ( @@ -233,9 +233,9 @@ SpmdInfo ElementwiseBinaryInferSpmd(const DistMetaTensor& x, const DistMetaTensor& y) { // Step0: Verify Input Args Based on Elementwise Logic auto x_shape = common::vectorize(x.dims()); - int x_ndim = x_shape.size(); + int x_ndim = static_cast(x_shape.size()); auto y_shape = common::vectorize(y.dims()); - int y_ndim = y_shape.size(); + int y_ndim = static_cast(y_shape.size()); TensorDistAttr x_dist_attr_src = x.dist_attr(); TensorDistAttr y_dist_attr_src = y.dist_attr(); std::vector x_dims_mapping = x_dist_attr_src.dims_mapping(); @@ -303,11 +303,11 @@ SpmdInfo ElementwiseBinaryInferSpmdReverse(const DistMetaTensor& x, const DistMetaTensor& out) { // Step0: Verify Input Args Based on Elementwise Logic auto x_shape = common::vectorize(x.dims()); - int x_ndim = x_shape.size(); + int x_ndim = static_cast(x_shape.size()); auto y_shape = common::vectorize(y.dims()); - int y_ndim = y_shape.size(); + int y_ndim = static_cast(y_shape.size()); auto out_shape = common::vectorize(out.dims()); - int out_ndim = out_shape.size(); + int out_ndim = static_cast(out_shape.size()); int max_ndim = std::max(x_ndim, y_ndim); TensorDistAttr out_dist_attr = out.dist_attr(); std::vector out_dims_mapping = out_dist_attr.dims_mapping(); diff --git a/paddle/phi/infermeta/spmd_rules/reduction.cc b/paddle/phi/infermeta/spmd_rules/reduction.cc index 608794d348541..ef5d93a04533e 100644 --- a/paddle/phi/infermeta/spmd_rules/reduction.cc +++ b/paddle/phi/infermeta/spmd_rules/reduction.cc @@ -71,7 +71,7 @@ SpmdInfo ReductionInferSpmdBase(const DistMetaTensor& x, int reduce_type) { // Step0: Verify input args based on reduction logic auto x_shape = common::vectorize(x.dims()); - int x_ndim = x_shape.size(); + int x_ndim = static_cast(x_shape.size()); auto x_dist_attr_src = x.dist_attr(); std::vector x_dims_mapping = x_dist_attr_src.dims_mapping(); PADDLE_ENFORCE_EQ( @@ -175,8 +175,8 @@ SpmdInfo ReductionInferSpmdReverse(const DistMetaTensor& x, // Step0: Verify input args based on reduction logic auto x_shape = common::vectorize(x.dims()); auto out_shape = common::vectorize(out.dims()); - int x_ndim = x_shape.size(); - int out_ndim = out_shape.size(); + int x_ndim = static_cast(x_shape.size()); + int out_ndim = static_cast(out_shape.size()); auto out_dist_attr_src = out.dist_attr(); std::vector out_dims_mapping = out_dist_attr_src.dims_mapping(); PADDLE_ENFORCE_EQ( @@ -240,7 +240,7 @@ SpmdInfo ReductionGradInferSpmd(const DistMetaTensor& x, for (size_t i = 0; i < axis_value.size(); ++i) { if (axis_value[i] < 0) { - axis_value[i] += x_dim.size(); + axis_value[i] += x_dim.size(); // NOLINT } } std::sort(axis_value.begin(), axis_value.end()); diff --git a/paddle/phi/infermeta/spmd_rules/replicated.cc b/paddle/phi/infermeta/spmd_rules/replicated.cc index 8d9c6d0d5be6c..390117862e04e 100644 --- a/paddle/phi/infermeta/spmd_rules/replicated.cc +++ b/paddle/phi/infermeta/spmd_rules/replicated.cc @@ -35,8 +35,8 @@ std::vector GetReplicatedDimsMapping(const int ndim) { SpmdInfo ReplicatedInferSpmd(const std::vector& ins, const std::vector& outs) { // step1: Build Einsum Notation for input tensor's batch axis - int64_t ninputs = ins.size(); - int64_t noutputs = outs.size(); + int64_t ninputs = 
static_cast(ins.size()); + int64_t noutputs = static_cast(outs.size()); // Step2: Unshard Output's Dims Mapping. std::vector output_dist_attrs; @@ -94,8 +94,8 @@ SpmdInfo ReplicatedInferSpmdReverse( const std::vector& ins, const std::vector& outs) { // step1: Build Einsum Notation for input tensor's batch axis - int64_t ninputs = ins.size(); - int64_t noutputs = outs.size(); + int64_t ninputs = static_cast(ins.size()); + int64_t noutputs = static_cast(outs.size()); // Step2: Unshard Output's Dims Mapping. std::vector output_dist_attrs; @@ -145,7 +145,7 @@ SpmdInfo ReplicatedInferDynamic( const std::vector*>>& inputs) { std::vector nonnull_inputs; - int64_t ninputs = inputs.size(); + int64_t ninputs = static_cast(inputs.size()); SpmdInfo spmd_info; auto build_tensor_dist_attr = diff --git a/paddle/phi/infermeta/spmd_rules/softmax.cc b/paddle/phi/infermeta/spmd_rules/softmax.cc index d86db4d41ae23..b6f886a49468a 100644 --- a/paddle/phi/infermeta/spmd_rules/softmax.cc +++ b/paddle/phi/infermeta/spmd_rules/softmax.cc @@ -31,7 +31,7 @@ using phi::distributed::auto_parallel::str_join; SpmdInfo SoftmaxInferSpmd(const DistMetaTensor& x, int axis) { // Step0: Verify input args based on softmax logic auto x_shape = common::vectorize(x.dims()); - int x_ndim = x_shape.size(); + int x_ndim = static_cast(x_shape.size()); auto x_dist_attr_src = x.dist_attr(); std::vector x_dims_mapping = x_dist_attr_src.dims_mapping(); PADDLE_ENFORCE_EQ( @@ -100,8 +100,8 @@ SpmdInfo SoftmaxInferSpmdReverse(const DistMetaTensor& x, // Step0: verify input args based on softmax logic auto x_shape = common::vectorize(x.dims()); auto out_shape = common::vectorize(out.dims()); - int x_ndim = x_shape.size(); - int out_ndim = out_shape.size(); + int x_ndim = static_cast(x_shape.size()); + int out_ndim = static_cast(out_shape.size()); auto out_dist_attr_src = out.dist_attr(); std::vector out_dims_mapping = out_dist_attr_src.dims_mapping(); PADDLE_ENFORCE_EQ( diff --git a/paddle/phi/infermeta/spmd_rules/unsqueeze.cc b/paddle/phi/infermeta/spmd_rules/unsqueeze.cc index ef47b31341a73..5521e1ba2a137 100644 --- a/paddle/phi/infermeta/spmd_rules/unsqueeze.cc +++ b/paddle/phi/infermeta/spmd_rules/unsqueeze.cc @@ -93,7 +93,7 @@ SpmdInfo UnsqueezeInferSpmd(const DistMetaTensor& x, const std::vector& axis) { // Step0: Verify input args based on unsqueeze logic auto x_shape = common::vectorize(x.dims()); - int x_ndim = x_shape.size(); + int x_ndim = static_cast(x_shape.size()); auto x_dist_attr_src = x.dist_attr(); std::vector x_dims_mapping = x_dist_attr_src.dims_mapping(); PADDLE_ENFORCE_EQ( @@ -162,9 +162,9 @@ SpmdInfo UnsqueezeInferSpmdReverse(const DistMetaTensor& x, const std::vector& axis) { // Step0: Verify input args based on unsqueeze logic auto x_shape = common::vectorize(x.dims()); - int x_ndim = x_shape.size(); + int x_ndim = static_cast(x_shape.size()); auto out_shape = common::vectorize(out.dims()); - int out_ndim = out_shape.size(); + int out_ndim = static_cast(out_shape.size()); auto out_dist_attr_src = out.dist_attr(); std::vector out_dims_mapping = out_dist_attr_src.dims_mapping(); PADDLE_ENFORCE_EQ( @@ -217,7 +217,7 @@ SpmdInfo UnsqueezeInferSpmdReverse(const DistMetaTensor& x, VLOG(4) << "UnsqueezeInferSpmdReverse: Out shape: [" << str_join(out_shape) << "] X shape: [" << str_join(x_shape) << "]"; VLOG(4) << "Transformation from output to input:"; - for (int64_t i = 0, n = trans.size(); i < n; i++) { + for (int64_t i = 0, n = static_cast(trans.size()); i < n; i++) { std::shared_ptr t = trans[i]; VLOG(4) << "\tX axis[" 
<< i << "]: " << t->to_string(); } diff --git a/paddle/phi/infermeta/spmd_rules/utils.cc b/paddle/phi/infermeta/spmd_rules/utils.cc index b67d7bd251b1b..336924dd5e951 100644 --- a/paddle/phi/infermeta/spmd_rules/utils.cc +++ b/paddle/phi/infermeta/spmd_rules/utils.cc @@ -423,13 +423,14 @@ TensorDistAttr FromPlacements( auto& placement = placements[mesh_dim]; if (placement->is_shard()) { auto shard_placement = std::dynamic_pointer_cast(placement); - dims_mapping[shard_placement->get_axis()] = mesh_dim; + dims_mapping[shard_placement->get_axis()] = + static_cast(mesh_dim); } if (placement->is_partial()) { auto partial_placement = std::dynamic_pointer_cast(placement); auto reduce_type = partial_placement->get_reduce_type(); - partial_status[mesh_dim] = reduce_type; + partial_status[mesh_dim] = reduce_type; // NOLINT } } dst_dist_attr.set_dims_mapping(dims_mapping); @@ -470,7 +471,7 @@ std::vector GetLocalShape( for (size_t i = 0; i < n_placement; i++) { auto& placement = placements.at(i); if (placement->is_shard()) { - auto mesh_dim_size = mesh.dim_size(i); + auto mesh_dim_size = mesh.dim_size(i); // NOLINT auto shard_dim = std::dynamic_pointer_cast(placement)->get_axis(); auto split_size = diff --git a/paddle/phi/kernels/funcs/jit/gen/blas.cc b/paddle/phi/kernels/funcs/jit/gen/blas.cc index 8c287efcf5ddd..1e29b7f4953fe 100644 --- a/paddle/phi/kernels/funcs/jit/gen/blas.cc +++ b/paddle/phi/kernels/funcs/jit/gen/blas.cc @@ -104,7 +104,7 @@ void VXXJitCode::genCode() { } else { vmovss(ptr[param3 + offset], xmm_dst); } - offset += sizeof(float) * block; + offset += sizeof(float) * block; // NOLINT rest -= block; } ret(); diff --git a/paddle/phi/kernels/funcs/jit/gen/gru.cc b/paddle/phi/kernels/funcs/jit/gen/gru.cc index 599564f431497..33dfaa6cd097c 100644 --- a/paddle/phi/kernels/funcs/jit/gen/gru.cc +++ b/paddle/phi/kernels/funcs/jit/gen/gru.cc @@ -39,7 +39,7 @@ void GRUJitCode::genCode() { vmovaps(ymm_one, ptr[reg_ptr_tmp + OFFSET_EXP_ONE]); } int offset = 0; - int d = num_ * sizeof(float); + int d = num_ * sizeof(float); // NOLINT for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { ymm_t ymm_u = ymm_t(1); ymm_t ymm_r = ymm_t(2); diff --git a/paddle/phi/kernels/funcs/jit/gen/lstm.cc b/paddle/phi/kernels/funcs/jit/gen/lstm.cc index e22a5a2880dff..4943989a50c79 100644 --- a/paddle/phi/kernels/funcs/jit/gen/lstm.cc +++ b/paddle/phi/kernels/funcs/jit/gen/lstm.cc @@ -42,7 +42,7 @@ void LSTMJitCode::genCode() { } int offset = 0; - int d = num_ * sizeof(float); + int d = num_ * sizeof(float); // NOLINT for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { /* gates: W_ch, W_ih, W_fh, W_oh */ ymm_t ymm_c = ymm_t(0); diff --git a/paddle/phi/kernels/fusion/onednn/fused_transpose_kernel.cc b/paddle/phi/kernels/fusion/onednn/fused_transpose_kernel.cc index a7f9e49e32560..f8a2f4fe0201e 100644 --- a/paddle/phi/kernels/fusion/onednn/fused_transpose_kernel.cc +++ b/paddle/phi/kernels/fusion/onednn/fused_transpose_kernel.cc @@ -34,7 +34,7 @@ void SetInMemDescWithSqueeze2FuseSupport( int j = 0; for (size_t i = 0; i < x_vec_dims.size(); ++i) { if (squeeze2_axes_set.count(i) || - squeeze2_axes_set.count(i - x_vec_dims.size())) { + squeeze2_axes_set.count(i - x_vec_dims.size())) { // NOLINT PADDLE_ENFORCE_EQ( x_vec_dims[i], 1, @@ -68,7 +68,7 @@ void FusedTransposeKernel(const Context& dev_ctx, if ((x_dims.size() >= 3) && (phi::OneDNNContext::tls().get_cur_paddle_data_layout() == phi::DataLayout::kNHWC)) { - int axis_size = axis.size(); + int axis_size = static_cast(axis.size()); std::vector formated_axis = axis; 
std::vector count(axis_size, 0); for (int i = 0; i < axis_size; i++) { @@ -85,7 +85,7 @@ void FusedTransposeKernel(const Context& dev_ctx, phi::DDim out_dims(x_dims); for (size_t i = 0; i < axis.size(); i++) { - out_dims[i] = x_dims[formated_axis[i]]; + out_dims[i] = x_dims[formated_axis[i]]; // NOLINT } out->Resize(out_dims); } diff --git a/paddle/phi/kernels/onednn/concat_grad_kernel.cc b/paddle/phi/kernels/onednn/concat_grad_kernel.cc index fc36fa4ab0fd8..9563f73f0ba92 100644 --- a/paddle/phi/kernels/onednn/concat_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/concat_grad_kernel.cc @@ -40,7 +40,7 @@ void ConcatGradKernel(const Context& dev_ctx, auto out_grad_vec_dims = common::vectorize(out_grad.dims()); - axis = funcs::ComputeAxis(axis, out_grad_vec_dims.size()); + axis = static_cast(funcs::ComputeAxis(axis, out_grad_vec_dims.size())); std::vector offset(out_grad_vec_dims.size(), 0); @@ -60,7 +60,7 @@ void ConcatGradKernel(const Context& dev_ctx, auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( grad, x_grad_vec_dims, - funcs::GetPlainOneDNNFormat(x_grad_vec_dims.size()), + funcs::GetPlainOneDNNFormat(static_cast(x_grad_vec_dims.size())), dev_ctx.GetPlace()); auto reorder_p = reorder_handler.AcquireReorder(reorder_dst_memory_p, slice_mem_p); diff --git a/paddle/phi/kernels/onednn/expand_grad_kernel.cc b/paddle/phi/kernels/onednn/expand_grad_kernel.cc index a8b1beb45832f..7de901df9561d 100644 --- a/paddle/phi/kernels/onednn/expand_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/expand_grad_kernel.cc @@ -50,7 +50,7 @@ void ExpandGradKernel(const Context& dev_ctx, auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( in_grad, - funcs::GetPlainOneDNNFormat(in_grad_vec_dims.size()), + funcs::GetPlainOneDNNFormat(static_cast(in_grad_vec_dims.size())), dev_ctx.GetPlace()); auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p, diff --git a/paddle/phi/kernels/onednn/matmul_grad_kernel.cc b/paddle/phi/kernels/onednn/matmul_grad_kernel.cc index 3866a2d06ae45..46a2a7450d41c 100644 --- a/paddle/phi/kernels/onednn/matmul_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/matmul_grad_kernel.cc @@ -51,8 +51,10 @@ void CalculateMatrixDims(const std::vector &x_dims, for (size_t i = 0; i < x_bd_dims->size() - 2; ++i) { (*out_bd_dims)[i] = std::max((*x_bd_dims)[i], (*y_bd_dims)[i]); } - int h_idx = trans_x ? x_bd_dims->size() - 1 : x_bd_dims->size() - 2; - int w_idx = trans_y ? y_bd_dims->size() - 2 : y_bd_dims->size() - 1; + int h_idx = + trans_x ? x_bd_dims->size() - 1 : x_bd_dims->size() - 2; // NOLINT + int w_idx = + trans_y ? y_bd_dims->size() - 2 : y_bd_dims->size() - 1; // NOLINT (*out_bd_dims)[x_bd_dims->size() - 2] = (*x_bd_dims)[h_idx]; (*out_bd_dims)[y_bd_dims->size() - 1] = (*y_bd_dims)[w_idx]; diff --git a/paddle/phi/kernels/onednn/matmul_kernel.cc b/paddle/phi/kernels/onednn/matmul_kernel.cc index b7b31ff479b30..342fce6f2be02 100644 --- a/paddle/phi/kernels/onednn/matmul_kernel.cc +++ b/paddle/phi/kernels/onednn/matmul_kernel.cc @@ -124,7 +124,7 @@ void MatmulKernel(const Context &dev_ctx, auto x_dims = common::vectorize(x.dims()); auto y_dims = common::vectorize(y.dims()); - int ndims = std::max(x_dims.size(), y_dims.size()); + int ndims = std::max(x_dims.size(), y_dims.size()); // NOLINT ndims = std::max(ndims, 3); std::vector x_bd_dims(ndims, 1); @@ -266,7 +266,7 @@ class MulPrimitiveFactory { auto scale_out_data = force_fp32_output ? 1.0f : scale_out; bool is_multi_channel = scale_y_data.size() > 1; - int count = is_multi_channel ? 
scale_y_data.size() : 1; + int count = is_multi_channel ? scale_y_data.size() : 1; // NOLINT std::vector output_shift_scale(count); for (int i = 0; i < count; i++) { if (scale_y_data[i] == 0.0) diff --git a/paddle/phi/kernels/onednn/slice_grad_kernel.cc b/paddle/phi/kernels/onednn/slice_grad_kernel.cc index 7f8f6b815b4f0..a929751433ab9 100644 --- a/paddle/phi/kernels/onednn/slice_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/slice_grad_kernel.cc @@ -60,7 +60,7 @@ void SliceGradKernel(const Context& dev_ctx, auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( input_grad, dx_dims, - funcs::GetPlainOneDNNFormat(dx_dims.size()), + funcs::GetPlainOneDNNFormat(static_cast(dx_dims.size())), dev_ctx.GetPlace()); memset(input_grad->data(), 0, reorder_dst_memory_p->get_desc().get_size()); diff --git a/paddle/phi/kernels/onednn/slice_kernel.cc b/paddle/phi/kernels/onednn/slice_kernel.cc index bd59d61c17e79..aeff6168f047c 100644 --- a/paddle/phi/kernels/onednn/slice_kernel.cc +++ b/paddle/phi/kernels/onednn/slice_kernel.cc @@ -69,7 +69,7 @@ void SliceKernel(const Context& dev_ctx, auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( out, slice_dims, - funcs::GetPlainOneDNNFormat(x_vec_dims.size()), + funcs::GetPlainOneDNNFormat(static_cast(x_vec_dims.size())), dev_ctx.GetPlace()); auto reorder_p = diff --git a/paddle/phi/kernels/onednn/squeeze_grad_kernel.cc b/paddle/phi/kernels/onednn/squeeze_grad_kernel.cc index d8ff4e72c1b11..78a3c4dce6bd3 100644 --- a/paddle/phi/kernels/onednn/squeeze_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/squeeze_grad_kernel.cc @@ -37,7 +37,7 @@ void SqueezeGradKernel(const Context& dev_ctx, dout.mem_desc(), funcs::to_void_cast(dout.data())); auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( dx, - funcs::GetPlainOneDNNFormat(dout_vec_dims.size()), + funcs::GetPlainOneDNNFormat(static_cast(dout_vec_dims.size())), dev_ctx.GetPlace()); auto reorder_p = reorder_handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); diff --git a/test/cpp/fluid/fused/cudnn_bn_add_relu_test.cc b/test/cpp/fluid/fused/cudnn_bn_add_relu_test.cc index 770093efdacb4..cad204415174b 100644 --- a/test/cpp/fluid/fused/cudnn_bn_add_relu_test.cc +++ b/test/cpp/fluid/fused/cudnn_bn_add_relu_test.cc @@ -764,7 +764,7 @@ class CudnnBNAddReluTester { int c = channels_; int64_t nhw = ele_count_; int32_t c_int32_elems = ((c + 63) & ~63) / 32; - int32_t nhw_int32_elems = (nhw + 31) & ~31; + int32_t nhw_int32_elems = (static_cast(nhw) + 31) & ~31; bitmask.Resize(common::make_ddim({nhw_int32_elems, c_int32_elems, 1})); auto data_shape = common::vectorize(x.dims()); diff --git a/test/cpp/fluid/memory/buddy_allocator_test.cc b/test/cpp/fluid/memory/buddy_allocator_test.cc index b399e6fc2ade1..7f4f452d0ebc3 100644 --- a/test/cpp/fluid/memory/buddy_allocator_test.cc +++ b/test/cpp/fluid/memory/buddy_allocator_test.cc @@ -173,8 +173,8 @@ TEST(BuddyAllocator, FractionRefillPool) { // Max chunk size should be same during allocation EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize()); - size_t alloc = - platform::GpuAvailableMemToAlloc() * FLAGS_fraction_of_gpu_memory_to_use; + size_t alloc = platform::GpuAvailableMemToAlloc() * + FLAGS_fraction_of_gpu_memory_to_use; // NOLINT // Exceed pool trigger refilling size of fraction of avaiable gpu, and should // be able to alloc 60% of the remaining GPU int* p1 = TestBuddyAllocator(&buddy_allocator, @@ -184,8 +184,8 @@ TEST(BuddyAllocator, FractionRefillPool) { // Max chunk size should be same during allocation 
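// Where the conversion is intentional, the hunks above keep the expression and
// append a trailing NOLINT instead of casting. A standalone sketch of the two
// suppression forms clang-tidy honors, on illustrative code: the bare form
// silences every check on that line, the named form only the listed check.
#include <cstddef>
#include <cstdio>

int main() {
  double fraction = 0.6;  // illustrative stand-in for a memory-fraction flag
  std::size_t available = 1u << 30;
  std::size_t pool = available * fraction;  // NOLINT
  std::size_t pool2 =
      available * fraction;  // NOLINT(bugprone-narrowing-conversions)
  std::printf("%zu %zu\n", pool, pool2);
  return 0;
}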
EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize()); - alloc = - platform::GpuAvailableMemToAlloc() * FLAGS_fraction_of_gpu_memory_to_use; + alloc = platform::GpuAvailableMemToAlloc() * + FLAGS_fraction_of_gpu_memory_to_use; // NOLINT // Exceed pool trigger refilling size of fraction of avaiable gpu, and should // be able to alloc 60% of the remaining GPU TestBuddyAllocator(&buddy_allocator, diff --git a/test/cpp/imperative/test_group.cc b/test/cpp/imperative/test_group.cc index 2243a24dee90d..287e67c9bcff4 100644 --- a/test/cpp/imperative/test_group.cc +++ b/test/cpp/imperative/test_group.cc @@ -73,7 +73,7 @@ void GroupConcatSplit(Place place, size_t size) { std::vector value; for (size_t j = 0; j < len; ++j) { - value.push_back(static_cast(1.0 * j)); + value.push_back(static_cast(1.0 * j)); // NOLINT } if (std::is_same::value) { @@ -89,7 +89,7 @@ void GroupConcatSplit(Place place, size_t size) { phi::DenseTensor tmp; tmp.ShareDataWith(*tensor).Resize({static_cast(len)}); group.dense_tensors_.push_back(std::move(tmp)); - group.all_length_ += len; + group.all_length_ += static_cast(len); group.dtype_ = framework::TransToProtoVarType(tensor->dtype()); } diff --git a/test/cpp/inference/api/analyzer_dam_tester.cc b/test/cpp/inference/api/analyzer_dam_tester.cc index d17f8670adcf4..ea31fe3760b53 100644 --- a/test/cpp/inference/api/analyzer_dam_tester.cc +++ b/test/cpp/inference/api/analyzer_dam_tester.cc @@ -193,7 +193,7 @@ void SetInput(std::vector> *inputs) { DataRecord data(FLAGS_infer_data, FLAGS_batch_size); std::vector input_slots; int test_batch_num = - FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1; + FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1; // NOLINT LOG(INFO) << "The number of samples to be test: " << test_batch_num * FLAGS_batch_size; for (int bid = 0; bid < test_batch_num; ++bid) { diff --git a/test/cpp/inference/api/analyzer_int8_object_detection_tester.cc b/test/cpp/inference/api/analyzer_int8_object_detection_tester.cc index 311fb0946ca00..12be843475b74 100644 --- a/test/cpp/inference/api/analyzer_int8_object_detection_tester.cc +++ b/test/cpp/inference/api/analyzer_int8_object_detection_tester.cc @@ -43,7 +43,7 @@ std::vector ReadObjectsNum(std::ifstream &file, file.clear(); file.seekg(offset); file.read(reinterpret_cast(num_objects.data()), - total_images * sizeof(size_t)); + total_images * sizeof(size_t)); // NOLINT if (file.eof()) LOG(ERROR) << "Reached end of stream"; if (file.fail()) throw std::runtime_error("Failed reading file."); diff --git a/test/cpp/inference/api/analyzer_lexical_analysis_gru_tester.cc b/test/cpp/inference/api/analyzer_lexical_analysis_gru_tester.cc index 2a79ce572dda2..2d0355d361b2d 100644 --- a/test/cpp/inference/api/analyzer_lexical_analysis_gru_tester.cc +++ b/test/cpp/inference/api/analyzer_lexical_analysis_gru_tester.cc @@ -49,7 +49,7 @@ std::vector ReadSentenceLod(std::ifstream &file, file.clear(); file.seekg(offset); file.read(reinterpret_cast(sentence_lod.data()), - total_sentences_num * sizeof(size_t)); + total_sentences_num * sizeof(size_t)); // NOLINT if (file.eof()) LOG(ERROR) << "Reached end of stream"; if (file.fail()) throw std::runtime_error("Failed reading file."); diff --git a/test/cpp/phi/kernels/test_fused_adam_kernel.cc b/test/cpp/phi/kernels/test_fused_adam_kernel.cc index 73e1b21ac3120..ec0926508c9e8 100644 --- a/test/cpp/phi/kernels/test_fused_adam_kernel.cc +++ b/test/cpp/phi/kernels/test_fused_adam_kernel.cc @@ -445,7 +445,7 @@ static auto GenerateRandomShapes(size_t n, uint64_t 
low, uint64_t high) { std::uniform_int_distribution dist(low, high); std::vector> shapes(n); for (size_t i = 0; i < n; ++i) { - shapes[i].push_back(dist(engine)); + shapes[i].push_back(static_cast(dist(engine))); } return shapes; } diff --git a/test/cpp/phi/kernels/test_memcpy_dev_api.cc b/test/cpp/phi/kernels/test_memcpy_dev_api.cc index 14f5fe15c301b..9a35a1ad99c3f 100644 --- a/test/cpp/phi/kernels/test_memcpy_dev_api.cc +++ b/test/cpp/phi/kernels/test_memcpy_dev_api.cc @@ -43,7 +43,7 @@ TEST(DEV_API, memcpy_d2h) { auto* x_cpu_data = cpu_ctx->template Alloc(&x_cpu); for (int i = 0; i < x_cpu.numel(); i++) { - x_cpu_data[i] = i; + x_cpu_data[i] = static_cast(i); } const auto alloc = From 9d7883a47040b284fb0c0006932d955345988adc Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Fri, 1 Mar 2024 10:56:51 +0800 Subject: [PATCH 43/55] [clang-tidy] NO.5 cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays (#61751) --- .../distributed/test/graph_node_split_test.cc | 8 +-- .../fluid/distributed/test/graph_node_test.cc | 10 +-- .../test/graph_table_sample_test.cc | 6 +- .../distributed/test/sparse_sgd_rule_test.cc | 66 +++++++++---------- paddle/fluid/framework/fleet/metrics.cc | 2 +- .../fluid/framework/heter_section_worker.cc | 2 +- paddle/fluid/framework/io/shell.cc | 20 +++--- .../fluid/operators/controlflow/pylayer_op.cc | 11 ++-- paddle/fluid/operators/nccl/nccl_op.cc | 2 +- .../pir/dialect/operator/ir/manual_op.cc | 16 +++-- paddle/fluid/platform/collective_helper.cc | 4 +- .../platform/profiler/cpu_utilization.cc | 13 ++-- paddle/fluid/pybind/eager_method.cc | 42 ++++++------ paddle/fluid/pybind/eager_properties.cc | 30 ++++----- paddle/fluid/pybind/eval_frame_tools.cc | 2 +- .../fusion/cpu/self_dp_attention_kernel.cc | 4 +- test/cpp/fluid/framework/tensor_util_test.cc | 4 +- test/cpp/fluid/math/im2col_test.cc | 10 +-- test/cpp/fluid/math/vol2col_test.cc | 9 +-- .../api/analysis_predictor_tester.cc | 12 ++-- .../api/analyzer_capi_exp_gpu_tester.cc | 16 ++--- .../api/analyzer_capi_exp_int_tester.cc | 16 ++--- .../api/analyzer_capi_exp_ner_tester.cc | 23 +++---- .../api/analyzer_capi_exp_pd_tensor_tester.cc | 22 +++---- .../analyzer_capi_exp_pd_threads_tester.cc | 4 +- .../inference/api/analyzer_capi_exp_tester.cc | 4 +- test/cpp/inference/api/analyzer_dam_tester.cc | 4 +- test/cpp/inference/api/analyzer_lac_tester.cc | 2 +- test/cpp/inference/api/analyzer_ner_tester.cc | 2 +- .../cpp/inference/api/analyzer_rnn1_tester.cc | 8 ++- .../api/trt_dynamic_shape_ernie_test.cc | 14 ++-- ...rt_dynamic_shape_transformer_prune_test.cc | 28 ++++---- .../inference/api/trt_rebind_stream_test.cc | 4 +- .../new_executor/standalone_executor_test.cc | 8 +-- test/cpp/phi/api/test_from_blob.cc | 16 ++--- test/cpp/phi/core/test_custom_kernel.cc | 2 +- test/cpp/phi/kernels/strided_memcpy_test.cc | 22 ++++--- test/cpp/pir/tools/test_op.cc | 3 +- 38 files changed, 244 insertions(+), 227 deletions(-) diff --git a/paddle/fluid/distributed/test/graph_node_split_test.cc b/paddle/fluid/distributed/test/graph_node_split_test.cc index cb47f3103883f..cbb7741a0a2d3 100644 --- a/paddle/fluid/distributed/test/graph_node_split_test.cc +++ b/paddle/fluid/distributed/test/graph_node_split_test.cc @@ -55,7 +55,7 @@ std::vector edges = {std::string("37\t45\t0.34"), std::string("97\t48\t0.34"), std::string("97\t247\t0.31"), std::string("97\t111\t0.21")}; -char edge_file_name[] = "edges.txt"; +char edge_file_name[] = "edges.txt"; // NOLINT std::vector nodes = { std::string("user\t37\ta 0.34\tb 13 14\tc hello\td abc"), @@ 
-74,12 +74,12 @@ std::vector nodes = { std::string("item\t49\ta 0.21"), std::string("item\t248\ta 0.21"), std::string("item\t113\ta 0.21")}; -char node_file_name[] = "nodes.txt"; +char node_file_name[] = "nodes.txt"; // NOLINT std::vector graph_split = {std::string("0\t97")}; -char graph_split_file_name[] = "graph_split.txt"; +char graph_split_file_name[] = "graph_split.txt"; // NOLINT -void prepare_file(char file_name[], std::vector data) { +void prepare_file(char file_name[], std::vector data) { // NOLINT std::ofstream ofile; ofile.open(file_name); for (auto x : data) { diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index 8c29c2bf1df3f..9cc16cb2580f5 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -236,8 +236,8 @@ const char* edges[] = {"37\t45\t0.34", "59\t122\t0.21", "97\t48\t0.34", "97\t247\t0.31", - "97\t111\t0.21"}; -char edge_file_name[] = "edges.txt"; + "97\t111\t0.21"}; // NOLINT +char edge_file_name[] = "edges.txt"; // NOLINT const char* nodes[] = {"user\t37\ta 0.34\tb 13 14\tc hello\td abc", "user\t96\ta 0.31\tb 15 10\tc 96hello\td abcd", @@ -254,10 +254,10 @@ const char* nodes[] = {"user\t37\ta 0.34\tb 13 14\tc hello\td abc", "item\t122\ta 0.21", "item\t49\ta 0.21", "item\t248\ta 0.21", - "item\t113\ta 0.21"}; -char node_file_name[] = "nodes.txt"; + "item\t113\ta 0.21"}; // NOLINT +char node_file_name[] = "nodes.txt"; // NOLINT -void prepare_file(char file_name[], bool load_edge) { +void prepare_file(char file_name[], bool load_edge) { // NOLINT std::ofstream ofile; ofile.open(file_name); if (load_edge) { diff --git a/paddle/fluid/distributed/test/graph_table_sample_test.cc b/paddle/fluid/distributed/test/graph_table_sample_test.cc index 5489129a070dd..286b19b7070ac 100644 --- a/paddle/fluid/distributed/test/graph_table_sample_test.cc +++ b/paddle/fluid/distributed/test/graph_table_sample_test.cc @@ -43,7 +43,7 @@ std::vector edges = {std::string("37\t45\t0.34"), std::string("97\t247\t0.31"), std::string("97\t111\t0.21")}; // odd id:96 48 122 112 -char edge_file_name[] = "edges.txt"; +char edge_file_name[] = "edges.txt"; // NOLINT std::vector nodes = { std::string("user\t37\ta 0.34\tb 13 14\tc hello\td abc"), @@ -62,9 +62,9 @@ std::vector nodes = { std::string("item\t49\ta 0.21"), std::string("item\t248\ta 0.21"), std::string("item\t113\ta 0.21")}; -char node_file_name[] = "nodes.txt"; +char node_file_name[] = "nodes.txt"; // NOLINT -void prepare_file(char file_name[], std::vector data) { +void prepare_file(char file_name[], std::vector data) { // NOLINT std::ofstream ofile; ofile.open(file_name); for (auto x : data) { diff --git a/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc b/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc index 120d8de56f793..a7029d1e8b127 100644 --- a/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc +++ b/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc @@ -37,8 +37,8 @@ TEST(sparse_value_naive_sgd_test, init_and_update) { // check init_value for zero const int kItemSize = 10; - float w[kItemSize]; - float grad[kItemSize]; + float w[kItemSize]; // NOLINT + float grad[kItemSize]; // NOLINT rule.InitValue(w, w + 9, true); for (float item : w) { @@ -58,16 +58,16 @@ TEST(sparse_value_naive_sgd_test, init_and_update) { for (auto i = 0u; i < kItemSize; ++i) { grad[i] = static_cast(i + 1) * 1.0; } - float label[] = {-0.100000, - -0.200000, - -0.300000, - -0.400000, - -0.500000, - -0.600000, - -0.700000, 
-                   -0.800000,
-                   -0.900000,
-                   -1.000000};
+  std::array<float, 10> label = {-0.100000,
+                                 -0.200000,
+                                 -0.300000,
+                                 -0.400000,
+                                 -0.500000,
+                                 -0.600000,
+                                 -0.700000,
+                                 -0.800000,
+                                 -0.900000,
+                                 -1.000000};
   const float* ptr_grad = grad;
   rule.UpdateValue(w, w + 9, ptr_grad);
@@ -93,7 +93,7 @@ TEST(downpour_sparse_adagrad_test, test_init_and_update) {
   // check init_value for zero
   const int kValueSize = 11;
   int kEmbSize = 10;
-  float w[kValueSize];
+  float w[kValueSize];  // NOLINT
   rule.InitValue(w, w + 10, true);
@@ -114,24 +114,24 @@ TEST(downpour_sparse_adagrad_test, test_init_and_update) {
     w[i] = 0;
   }
   w[kEmbSize] = 0;
-  float grad[kEmbSize];
+  float grad[kEmbSize];  // NOLINT
   for (int i = 0; i < kEmbSize; ++i) {
     grad[i] = static_cast<float>(i + 1) * 1.0;
   }
   const float* ptr_grad = grad;
   rule.UpdateValue(w, w + 10, ptr_grad);
-  float label[] = {-0.100000,
-                   -0.200000,
-                   -0.300000,
-                   -0.400000,
-                   -0.500000,
-                   -0.600000,
-                   -0.700000,
-                   -0.800000,
-                   -0.900000,
-                   -1.000000,
-                   38.500000};
+  std::array<float, 11> label = {-0.100000,
+                                 -0.200000,
+                                 -0.300000,
+                                 -0.400000,
+                                 -0.500000,
+                                 -0.600000,
+                                 -0.700000,
+                                 -0.800000,
+                                 -0.900000,
+                                 -1.000000,
+                                 38.500000};
   for (auto i = 0u; i < kValueSize; ++i) {
     ASSERT_FLOAT_EQ(w[i], label[i]);
   }
@@ -190,14 +190,14 @@ TEST(downpour_sparse_adam_test, test_init_and_update) {
     grad[i] = static_cast<float>(i + 1) * 1.0;
   }
-  float label[] = {-0.0999999642, -0.099999994, -0.099999994, -0.099999994,
-                   -0.099999994,  -0.099999994, -0.099999994, -0.100000001,
-                   -0.100000009,  -0.100000001, 0.100000024,  0.200000048,
-                   0.300000072,   0.400000095,  0.500000119,  0.600000143,
-                   0.700000167,   0.800000191,  0.900000215,  1.00000024,
-                   0.000999987125, 0.0039999485, 0.00899988413, 0.015999794,
-                   0.0249996781,  0.0359995365, 0.0489993691, 0.063999176,
-                   0.0809989572,  0.0999987125, 0.809999943,  0.998001039};
+  std::array<float, 32> label = {
+      -0.0999999642, -0.099999994, -0.099999994, -0.099999994, -0.099999994,
+      -0.099999994,  -0.099999994, -0.100000001, -0.100000009, -0.100000001,
+      0.100000024,   0.200000048,  0.300000072,  0.400000095,  0.500000119,
+      0.600000143,   0.700000167,  0.800000191,  0.900000215,  1.00000024,
+      0.000999987125, 0.0039999485, 0.00899988413, 0.015999794, 0.0249996781,
+      0.0359995365,  0.0489993691, 0.063999176,  0.0809989572, 0.0999987125,
+      0.809999943,   0.998001039};
   rule.UpdateValue(value, value + embed_dim, grad);
diff --git a/paddle/fluid/framework/fleet/metrics.cc b/paddle/fluid/framework/fleet/metrics.cc
index 5801860f66566..57fe43fb44624 100644
--- a/paddle/fluid/framework/fleet/metrics.cc
+++ b/paddle/fluid/framework/fleet/metrics.cc
@@ -219,7 +219,7 @@ void BasicAucCalculator::calculate_bucket_error() {
       }
     }
   } else {
-    double* table[2] = {&_table[0][0], &_table[1][0]};
+    double* table[2] = {&_table[0][0], &_table[1][0]};  // NOLINT
     for (int i = 0; i < _table_size; i++) {
       double click = table[1][i];
       double show = table[0][i] + table[1][i];
diff --git a/paddle/fluid/framework/heter_section_worker.cc b/paddle/fluid/framework/heter_section_worker.cc
index 65902f6c2d0c7..cecfa39d3c16b 100644
--- a/paddle/fluid/framework/heter_section_worker.cc
+++ b/paddle/fluid/framework/heter_section_worker.cc
@@ -507,7 +507,7 @@ void HeterSectionWorker::PrintFetchVars() {
   if (thread_id_ == 0 && batch_num_ % batch_per_print == 0) {
     time_t curtime;
     time(&curtime);
-    char mbstr[80];
+    char mbstr[80];  // NOLINT
     std::strftime(
         mbstr, sizeof(mbstr), "%Y-%m-%d %H:%M:%S", std::localtime(&curtime));
     std::stringstream ss;
diff --git a/paddle/fluid/framework/io/shell.cc b/paddle/fluid/framework/io/shell.cc
index cc893fefbb34f..fa449c1b10867
100644 --- a/paddle/fluid/framework/io/shell.cc +++ b/paddle/fluid/framework/io/shell.cc @@ -58,7 +58,7 @@ static int close_open_fds_internal() { long d_ino = 0; // NOLINT off_t d_off; unsigned short d_reclen = 0; // NOLINT - char d_name[256]; + char d_name[256]; // NOLINT }; int dir_fd = -1; @@ -66,7 +66,7 @@ static int close_open_fds_internal() { PADDLE_THROW(platform::errors::Unavailable("Failed to open proc/self/fd.")); return -1; } - char buffer[sizeof(linux_dirent)]; + char buffer[sizeof(linux_dirent)]; // NOLINT for (;;) { int bytes = 0; @@ -187,8 +187,8 @@ std::shared_ptr shell_popen(const std::string& cmd, std::string real_cmd = "set -o pipefail; " + cmd; - int pipe_fds[2]; - if (pipe(pipe_fds) != 0) { + std::array pipe_fds; + if (pipe(pipe_fds.data()) != 0) { *err_no = -1; return nullptr; } @@ -300,17 +300,17 @@ std::pair, std::shared_ptr> shell_p2open( std::string real_cmd = "set -o pipefail; " + cmd; - int pipein_fds[2]; - int pipeout_fds[2]; - if (pipe(pipein_fds) != 0) { + std::array pipein_fds; + std::array pipeout_fds; + if (pipe(pipein_fds.data()) != 0) { return {nullptr, nullptr}; } - if (pipe(pipeout_fds) != 0) { + if (pipe(pipeout_fds.data()) != 0) { return {nullptr, nullptr}; } - int child_pid = - shell_p2open_fork_internal(real_cmd.c_str(), pipein_fds, pipeout_fds); + int child_pid = shell_p2open_fork_internal( + real_cmd.c_str(), pipein_fds.data(), pipeout_fds.data()); close(pipein_fds[1]); close(pipeout_fds[0]); diff --git a/paddle/fluid/operators/controlflow/pylayer_op.cc b/paddle/fluid/operators/controlflow/pylayer_op.cc index c4b06f326a703..bd83c99a0c62d 100644 --- a/paddle/fluid/operators/controlflow/pylayer_op.cc +++ b/paddle/fluid/operators/controlflow/pylayer_op.cc @@ -26,11 +26,12 @@ namespace { // NOLINT enum class PyLayerBlockIndex { kFORWARD = 0, kBACKWARD = 1, kNONE = 2 }; } // namespace -const char PyLayerOp::kInputs[] = "Input"; -const char PyLayerOp::kOutputs[] = "Out"; -const char PyLayerOp::kScope[] = "Scope"; -const char PyLayerOp::kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; -const char PyLayerOp::kBlocks[] = "blocks"; +const char PyLayerOp::kInputs[] = "Input"; // NOLINT +const char PyLayerOp::kOutputs[] = "Out"; // NOLINT +const char PyLayerOp::kScope[] = "Scope"; // NOLINT +const char PyLayerOp::kSkipEagerDeletionVars[] = + "skip_eager_deletion_vars"; // NOLINT +const char PyLayerOp::kBlocks[] = "blocks"; // NOLINT void PyLayerOp::CreateInterpreter( const platform::Place &dev_place, diff --git a/paddle/fluid/operators/nccl/nccl_op.cc b/paddle/fluid/operators/nccl/nccl_op.cc index 8b06aa653c070..c5a1097e2f157 100644 --- a/paddle/fluid/operators/nccl/nccl_op.cc +++ b/paddle/fluid/operators/nccl/nccl_op.cc @@ -18,7 +18,7 @@ limitations under the License. 
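// The shell.cc change above is the canonical modernize-avoid-c-arrays fix for
// buffers that must still cross a C ABI: hold the storage in std::array and
// hand the raw pointer to the C call through .data(). A standalone POSIX
// sketch of the same pattern:
#include <array>
#include <cstdio>
#include <unistd.h>

int main() {
  std::array<int, 2> pipe_fds;       // [0] read end, [1] write end
  if (pipe(pipe_fds.data()) != 0) {  // pipe() still receives an int*
    std::perror("pipe");
    return 1;
  }
  close(pipe_fds[0]);
  close(pipe_fds[1]);
  return 0;
}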
*/ namespace paddle { namespace operators { -static constexpr char kParallelScopes[] = "parallel_scopes"; +static constexpr char kParallelScopes[] = "parallel_scopes"; // NOLINT // NCCLinitOp class NCCLInitOp : public framework::OperatorBase { diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc index ec61f6c7dd88d..b7cebeaf27f47 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc @@ -535,8 +535,10 @@ std::vector AddNArrayOp::InferMeta( return argument_outputs; } -const char *FusedGemmEpilogueOp::attributes_name[3] = { - "trans_x", "trans_y", "activation"}; +const char *FusedGemmEpilogueOp::attributes_name[3] = { // NOLINT + "trans_x", + "trans_y", + "activation"}; OpInfoTuple FusedGemmEpilogueOp::GetOpInfo() { std::vector inputs = { @@ -849,8 +851,10 @@ std::vector FusedGemmEpilogueOp::InferMeta( return argument_outputs; } -const char *FusedGemmEpilogueGradOp::attributes_name[3] = { - "trans_x", "trans_y", "activation_grad"}; +const char *FusedGemmEpilogueGradOp::attributes_name[3] = { // NOLINT + "trans_x", + "trans_y", + "activation_grad"}; OpInfoTuple FusedGemmEpilogueGradOp::GetOpInfo() { std::vector inputs = { @@ -1171,7 +1175,7 @@ std::vector FusedGemmEpilogueGradOp::InferMeta( return argument_outputs; } -const char *SplitGradOp::attributes_name[1] = {"axis"}; +const char *SplitGradOp::attributes_name[1] = {"axis"}; // NOLINT OpInfoTuple SplitGradOp::GetOpInfo() { std::vector inputs = { @@ -1360,7 +1364,7 @@ std::vector SplitGradOp::InferMeta( return argument_outputs; } -const char *CreateArrayOp::attributes_name[1] = {"dtype"}; +const char *CreateArrayOp::attributes_name[1] = {"dtype"}; // NOLINT OpInfoTuple CreateArrayOp::GetOpInfo() { std::vector inputs = {}; diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index 3444f71639b46..e3be121820684 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -133,7 +133,7 @@ void NCCLCommContext::CreateAllNCCLComms(const std::vector& dev_ids, dev_ids.size())); const int kDevices = dev_ids.size(); - ncclComm_t comms[kDevices]; + ncclComm_t comms[kDevices]; // NOLINT PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclCommInitAll( comms, dev_ids.size(), dev_ids.data())); @@ -169,7 +169,7 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( VLOG(1) << "Begin CreateNCCLCommMultiTrainer. device number: " << kDevices << ", ntrainers: " << ntrainers << ", train_id: " << train_id << ", rind_id: " << ring_id; - ncclComm_t comms[kDevices]; + ncclComm_t comms[kDevices]; // NOLINT { PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupStart()); for (int i = 0; i < kDevices; i++) { diff --git a/paddle/fluid/platform/profiler/cpu_utilization.cc b/paddle/fluid/platform/profiler/cpu_utilization.cc index e84256f49f078..d373ac32ea6aa 100644 --- a/paddle/fluid/platform/profiler/cpu_utilization.cc +++ b/paddle/fluid/platform/profiler/cpu_utilization.cc @@ -24,6 +24,7 @@ // limitations under the License. 
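// collective_helper.cc above keeps its runtime-length array
// (ncclComm_t comms[kDevices]) and marks it NOLINT, since a variable-length
// array is a GNU extension that cppcoreguidelines-avoid-c-arrays rejects
// outright. A standalone sketch of the heap-backed alternative the guideline
// prefers, with a dummy handle type standing in for ncclComm_t:
#include <cstdio>
#include <vector>

using FakeComm = void*;  // illustrative stand-in for ncclComm_t

void InitAll(FakeComm* comms, int n) {  // stand-in for ncclCommInitAll
  for (int i = 0; i < n; ++i) comms[i] = nullptr;
}

int main() {
  std::vector<int> dev_ids = {0, 1, 2};
  const int kDevices = static_cast<int>(dev_ids.size());
  std::vector<FakeComm> comms(kDevices);  // same contiguous layout, no VLA
  InitAll(comms.data(), kDevices);
  std::printf("initialized %d comms\n", kDevices);
  return 0;
}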
#include "paddle/fluid/platform/profiler/cpu_utilization.h" +#include namespace paddle { namespace platform { @@ -53,16 +54,16 @@ void CpuUtilization::RecordBeginTimeInfo() { #elif defined(__linux__) start_ = times(&process_tms_start_); #define proc_path_size 1024 - static char proc_stat_path[proc_path_size] = "/proc/stat"; + static char proc_stat_path[proc_path_size] = "/proc/stat"; // NOLINTf FILE *stat_file = fopen(proc_stat_path, "r"); if (stat_file != nullptr) { - char temp_str[200]; + std::array temp_str; uint64_t temp_lu; int retval = fscanf(stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, - temp_str, + temp_str.data(), &system_tms_start_.tms_utime, &nice_time_start_, &system_tms_start_.tms_stime, @@ -98,16 +99,16 @@ void CpuUtilization::RecordEndTimeInfo() { #elif defined(__linux__) end_ = times(&process_tms_end_); #define proc_path_size 1024 - static char proc_stat_path[proc_path_size] = "/proc/stat"; + static char proc_stat_path[proc_path_size] = "/proc/stat"; // NOLINT FILE *stat_file = fopen(proc_stat_path, "r"); if (stat_file != nullptr) { - char temp_str[200]; + std::array temp_str; uint64_t temp_lu; int retval = fscanf(stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, - temp_str, + temp_str.data(), &system_tms_end_.tms_utime, &nice_time_end_, &system_tms_end_.tms_stime, diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 6fe07282a2223..16d5fea43fe76 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -603,7 +603,7 @@ static PyObject* tensor_method__copy_to(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } -PyDoc_STRVAR(tensor_reconstruct_from___doc__, +PyDoc_STRVAR(tensor_reconstruct_from___doc__, // NOLINT R"DOC(reconstruct_from_($self, other/) -- @@ -786,7 +786,7 @@ Enables this Tensor to have their grad populated during backward(). It is a no-o >>> print(y.grad) Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=False, [1., 1., 1.]) -)DOC"); +)DOC"); // NOLINT static PyObject* tensor_retain_grads(TensorObject* self, PyObject* args, @@ -1219,7 +1219,7 @@ static PyObject* tensor_method_detach_(TensorObject* self, Py_INCREF(reinterpret_cast(self)); return reinterpret_cast(self); EAGER_CATCH_AND_THROW_RETURN_NULL -} +} // NOLINT PyDoc_STRVAR(tensor_method_get_tensor__doc__, R"DOC(get_tensor($self, /) -- @@ -1243,7 +1243,7 @@ Returns the underline tensor in the origin Tensor. - layout: NCHW - dtype: float32 - data: [1] -)DOC"); +)DOC"); // NOLINT static PyObject* tensor_method_get_underline_tensor(TensorObject* self, PyObject* args, @@ -2197,7 +2197,7 @@ Returns the total number of non zero elements in input SparseCooTensor/SparseCsr >>> coo.nnz() 3 -)DOC"); +)DOC"); // NOLINT static PyObject* tensor_method_get_non_zero_nums(TensorObject* self, PyObject* args, @@ -2247,7 +2247,7 @@ Returns the indices of non zero elements in input SparseCooTensor. [[0, 1, 2], [1, 2, 0]]) -)DOC"); +)DOC"); // NOLINT static PyObject* tensor_method_get_non_zero_indices(TensorObject* self, PyObject* args, @@ -2290,7 +2290,7 @@ Returns the values of non zero elements in input SparseCooTensor. 
Tensor(shape=[3], dtype=float32, place=Place(gpu:0), stop_gradient=True, [1., 2., 3.]) -)DOC"); +)DOC"); // NOLINT static PyObject* tensor_method_get_non_zero_elements(TensorObject* self, PyObject* args, @@ -2344,7 +2344,7 @@ Returns the compressed row index of non zero elements in input SparseCsrTensor. Tensor(shape=[4], dtype=int64, place=Place(gpu:0), stop_gradient=True, [0, 2, 3, 5]) -)DOC"); +)DOC"); // NOLINT static PyObject* tensor_method_get_non_zero_crows(TensorObject* self, PyObject* args, @@ -2388,7 +2388,7 @@ Returns the column index of non zero elements in input SparseCsrTensor. Tensor(shape=[5], dtype=int64, place=Place(gpu:0), stop_gradient=True, [1, 3, 2, 0, 1]) -)DOC"); +)DOC"); // NOLINT static PyObject* tensor_method_get_non_zero_cols(TensorObject* self, PyObject* args, @@ -2422,7 +2422,7 @@ Whether the Tensor is a Dense Tensor. >>> x = paddle.to_tensor([1.0], stop_gradient=False) >>> print(x.is_dense()) True -)DOC"); +)DOC"); // NOLINT static PyObject* tensor_method_is_dense(TensorObject* self, PyObject* args, @@ -2452,7 +2452,7 @@ Whether the Tensor is a Distributed Tensor. >>> x = paddle.to_tensor([1.0], stop_gradient=False) >>> print(x.is_dist()) False -)DOC"); +)DOC"); // NOLINT static PyObject* tensor_method_is_dist(TensorObject* self, PyObject* args, @@ -2489,7 +2489,8 @@ When input is SparseCooTensor/SparseCsrTensor, will return True. When input is D >>> coo.is_sparse() True -)DOC"); +)DOC"); // NOLINT + static PyObject* tensor_method_is_sparse(TensorObject* self, PyObject* args, PyObject* kwargs) { @@ -2526,7 +2527,7 @@ When input is SparseCooTensor, will return True. When input is DenseTensor/Spars >>> coo.is_sparse_coo() True -)DOC"); +)DOC"); // NOLINT static PyObject* tensor_method_is_sparse_coo(TensorObject* self, PyObject* args, @@ -2564,7 +2565,7 @@ When input is SparseCsrTensor, will return True. When input is DenseTensor/Spars >>> csr.is_sparse_csr() True -)DOC"); +)DOC"); // NOLINT static PyObject* tensor_method_is_sparse_csr(TensorObject* self, PyObject* args, @@ -2607,7 +2608,7 @@ When input is SparseCooTensor, will convert `COO` to `CSR` . When input is Dense cols=[1, 2, 0], values=[1., 2., 3.]) -)DOC"); +)DOC"); // NOLINT static PyObject* tensor_method_to_sparse_csr(TensorObject* self, PyObject* args, @@ -2654,7 +2655,7 @@ Any two type Tensor among DenseTensor/SparseCooTensor/SparseCsrTensor are suppor >>> x.is_same_shape(z) False -)DOC"); +)DOC"); // NOLINT static PyObject* tensor_method_is_same_shape(TensorObject* self, PyObject* args, @@ -2957,7 +2958,7 @@ Returns the address of the first element of current Tensor. >>> # doctest: +SKIP('return the address') 93220864 >>> # doctest: -SKIP -)DOC"); +)DOC"); // NOLINT static PyObject* tensor_data_ptr(TensorObject* self, PyObject* args, @@ -3019,7 +3020,7 @@ Returns the strides of current Tensor. >>> y = x[1] >>> print(y.get_strides()) [] -)DOC"); +)DOC"); // NOLINT static PyObject* tensor_method_strides(TensorObject* self, PyObject* args, @@ -3061,7 +3062,7 @@ If self tensor is already contiguous, this function returns the current Tensor. >>> y = y.contiguous() >>> print(y) Tensor(shape=[], dtype=int64, place=Place(cpu), stop_gradient=True, 2) -)DOC"); +)DOC"); // NOLINT static PyObject* tensor_contiguous(TensorObject* self, PyObject* args, @@ -3110,7 +3111,8 @@ Whether the Tensor is contiguous. 
>>> x = paddle.to_tensor([1, 2, 3]) >>> y = x[1] >>> print(y.is_contiguous()) -)DOC"); +)DOC"); // NOLINT + static PyObject* tensor_is_contiguous(TensorObject* self, PyObject* args, PyObject* kwargs) { diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 2a2b94b715abd..fa926618bdf8d 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -40,7 +40,7 @@ namespace pybind { extern PyTypeObject* p_tensor_type; -PyDoc_STRVAR(tensor_name__doc__, +PyDoc_STRVAR(tensor_name__doc__, // NOLINT R"DOC(name Tensor's name. @@ -75,7 +75,7 @@ PyObject* tensor_properties_get_name(TensorObject* self, void* closure) { EAGER_CATCH_AND_THROW_RETURN_NULL } -PyDoc_STRVAR(tensor_type__doc__, +PyDoc_STRVAR(tensor_type__doc__, // NOLINT R"DOC(type Tensor's type. @@ -165,7 +165,7 @@ int tensor_properties_set_name(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NEG } -PyDoc_STRVAR(tensor_stop_gradient__doc__, +PyDoc_STRVAR(tensor_stop_gradient__doc__, // NOLINT R"DOC(stop_gradient Tensor's stop_gradient. @@ -195,7 +195,7 @@ PyObject* tensor_properties_get_stop_gradient(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } -PyDoc_STRVAR(tensor_data__doc__, +PyDoc_STRVAR(tensor_data__doc__, // NOLINT R"DOC(data Tensor's self. @@ -258,7 +258,7 @@ int tensor_properties_set_data(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NEG } -PyDoc_STRVAR(tensor_grad__doc__, +PyDoc_STRVAR(tensor_grad__doc__, // NOLINT R"DOC(grad Tensor's grad Tensor. @@ -356,7 +356,7 @@ int tensor_properties_set_stop_gradient(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NEG } -PyDoc_STRVAR(tensor_persistable__doc__, +PyDoc_STRVAR(tensor_persistable__doc__, // NOLINT R"DOC(persistable Tensor's persistable. @@ -395,7 +395,7 @@ int tensor_properties_set_persistable(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NEG } -PyDoc_STRVAR(tensor_process_mesh__doc__, +PyDoc_STRVAR(tensor_process_mesh__doc__, // NOLINT R"DOC(process_mesh Get process_mesh property from shard tensor. @@ -441,7 +441,7 @@ PyObject* tensor_properties_get_process_mesh(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } -PyDoc_STRVAR(tensor_placements__doc__, +PyDoc_STRVAR(tensor_placements__doc__, // NOLINT R"DOC(placements Get placements property from shard tensor. @@ -487,7 +487,7 @@ PyObject* tensor_properties_get_placements(TensorObject* self, void* closure) { EAGER_CATCH_AND_THROW_RETURN_NULL } -PyDoc_STRVAR(tensor_num_shard__doc__, +PyDoc_STRVAR(tensor_num_shard__doc__, // NOLINT R"DOC(num_shard Tensor's num_shard. @@ -553,7 +553,7 @@ PyObject* tensor_properties_get_local_shape(TensorObject* self, void* closure) { EAGER_CATCH_AND_THROW_RETURN_NULL } -PyDoc_STRVAR(tensor_shape__doc__, +PyDoc_STRVAR(tensor_shape__doc__, // NOLINT R"DOC(shape Tensor's shape. @@ -640,7 +640,7 @@ PyObject* tensor_properties_get_shape(TensorObject* self, void* closure) { EAGER_CATCH_AND_THROW_RETURN_NULL } -PyDoc_STRVAR(tensor_strides__doc__, +PyDoc_STRVAR(tensor_strides__doc__, // NOLINT R"DOC(strides Tensor's strides. @@ -679,7 +679,7 @@ PyObject* tensor_properties_get_strides(TensorObject* self, void* closure) { EAGER_CATCH_AND_THROW_RETURN_NULL } -PyDoc_STRVAR(tensor_offset__doc__, +PyDoc_STRVAR(tensor_offset__doc__, // NOLINT R"DOC(offset The address of the first element relative to the offset of the video memory. 
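// Every PyDoc_STRVAR in eager_properties.cc above gains a NOLINT because the
// CPython macro expands to a static char array, which modernize-avoid-c-arrays
// flags even though the macro itself cannot be rewritten. A sketch of roughly
// what the macro expands to; the exact expansion depends on the CPython
// version, and the docstring text here is illustrative:
#include <cstdio>

// PyDoc_STRVAR(tensor_place__doc__, "place\n\n...") expands to approximately:
static const char tensor_place__doc__[] =  // NOLINT
    "place\n\nThe device where the Tensor's memory is located.";

int main() {
  std::printf("%s\n", tensor_place__doc__);
  return 0;
}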
@@ -726,7 +726,7 @@ PyObject* tensor_properties_get_offset(TensorObject* self, void* closure) { EAGER_CATCH_AND_THROW_RETURN_NULL } -PyDoc_STRVAR(tensor_layout__doc__, +PyDoc_STRVAR(tensor_layout__doc__, // NOLINT R"DOC(layout Tensor's memory layout. @@ -761,7 +761,7 @@ PyObject* tensor_properties_get_layout(TensorObject* self, void* closure) { EAGER_CATCH_AND_THROW_RETURN_NULL } -PyDoc_STRVAR(tensor_place__doc__, +PyDoc_STRVAR(tensor_place__doc__, // NOLINT R"DOC(place The device Tensor's memory locate. @@ -828,7 +828,7 @@ PyObject* tensor_properties_get_placements_str(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } -PyDoc_STRVAR(tensor_dtype__doc__, +PyDoc_STRVAR(tensor_dtype__doc__, // NOLINT R"DOC(dtype Tensor's data type. diff --git a/paddle/fluid/pybind/eval_frame_tools.cc b/paddle/fluid/pybind/eval_frame_tools.cc index da78ce66373e8..504dbc5b9fa01 100644 --- a/paddle/fluid/pybind/eval_frame_tools.cc +++ b/paddle/fluid/pybind/eval_frame_tools.cc @@ -34,7 +34,7 @@ class TreeNode { private: int is_prefix; - TreeNode* children[256]; + TreeNode* children[256]; // NOLINT }; void TreeNode::clear() { diff --git a/paddle/phi/kernels/fusion/cpu/self_dp_attention_kernel.cc b/paddle/phi/kernels/fusion/cpu/self_dp_attention_kernel.cc index 56107c31d6d9c..0d3189187351c 100644 --- a/paddle/phi/kernels/fusion/cpu/self_dp_attention_kernel.cc +++ b/paddle/phi/kernels/fusion/cpu/self_dp_attention_kernel.cc @@ -161,8 +161,8 @@ void sgemm(const float* A, int ldc = n; float alpha = 1; float beta = 0; - char ta[] = "N"; - char tb[] = "N"; + std::array ta = {"N"}; + std::array tb = {"N"}; if (transa) ta[0] = 'T'; if (transb) tb[0] = 'T'; diff --git a/test/cpp/fluid/framework/tensor_util_test.cc b/test/cpp/fluid/framework/tensor_util_test.cc index 6b9c25750ac07..80140dfdbe1c1 100644 --- a/test/cpp/fluid/framework/tensor_util_test.cc +++ b/test/cpp/fluid/framework/tensor_util_test.cc @@ -68,8 +68,8 @@ TEST(TensorCopy, Tensor) { int* src_ptr = src_tensor.mutable_data(common::make_ddim({3, 3}), platform::CPUPlace()); - int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - memcpy(src_ptr, arr, 9 * sizeof(int)); + std::array arr = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr.data(), 9 * sizeof(int)); // CPU phi::DenseTensor to GPU phi::DenseTensor auto gpu_place = new platform::CUDAPlace(0); diff --git a/test/cpp/fluid/math/im2col_test.cc b/test/cpp/fluid/math/im2col_test.cc index f3925bce95869..36968d7ab68fc 100644 --- a/test/cpp/fluid/math/im2col_test.cc +++ b/test/cpp/fluid/math/im2col_test.cc @@ -207,8 +207,8 @@ void testIm2col() { (input_width - filter_size + padding[2] + padding[3]) / stride[1] + 1; float* input_ptr = input_tmp.mutable_data( {1, input_height, input_width}, paddle::platform::CPUPlace()); - float arr[6] = {0, 1, 2, 3, 4, 5}; - memcpy(input_ptr, arr, 6 * sizeof(float)); + std::array arr = {0, 1, 2, 3, 4, 5}; + memcpy(input_ptr, arr.data(), 6 * sizeof(float)); auto* place = new paddle::platform::CUDAPlace(); auto* context = new phi::GPUContext(*place); @@ -235,8 +235,8 @@ void testIm2col() { im2col(*context, input, dilation, stride, padding, &output_cfo); im2col_ocf(*context, input, dilation, stride, padding, &output_ocf); - float out_cfo_data[] = {0, 1, 1, 2, 3, 4, 4, 5}; - float out_ocf_data[] = {0, 1, 3, 4, 1, 2, 4, 5}; + std::array out_cfo_data = {0, 1, 1, 2, 3, 4, 4, 5}; + std::array out_ocf_data = {0, 1, 3, 4, 1, 2, 4, 5}; float* out_cfo_ptr; if (paddle::platform::is_cpu_place(*place)) { @@ -268,7 +268,7 @@ void testIm2col() { col2im; phi::funcs::Col2ImFunctor col2im_ocf; - float 
col2im_data[] = {0, 2, 2, 3, 8, 5}; + std::array col2im_data = {0, 2, 2, 3, 8, 5}; memset(input_ptr, 0, 6 * sizeof(float)); if (paddle::platform::is_cpu_place(*place)) { diff --git a/test/cpp/fluid/math/vol2col_test.cc b/test/cpp/fluid/math/vol2col_test.cc index 9a6f14c3685cb..12fd0085ee661 100644 --- a/test/cpp/fluid/math/vol2col_test.cc +++ b/test/cpp/fluid/math/vol2col_test.cc @@ -187,8 +187,8 @@ void testVol2col() { float* input_ptr = input_tmp.mutable_data({1, input_depth, input_height, input_width}, paddle::platform::CPUPlace()); - float arr[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - memcpy(input_ptr, arr, 12 * sizeof(float)); + std::array arr = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + memcpy(input_ptr, arr.data(), 12 * sizeof(float)); if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; @@ -207,7 +207,8 @@ void testVol2col() { phi::funcs::Vol2ColFunctor vol2col; vol2col(*context, input, dilations, strides, paddings, &output); - float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11}; + std::array vol_2_col = { + 0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11}; float* out_cfo_ptr; if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output.data(); @@ -222,7 +223,7 @@ void testVol2col() { } // Col2Vol test - float col_2_vol[] = {0, 2, 2, 3, 8, 5, 6, 14, 8, 9, 20, 11}; + std::array col_2_vol = {0, 2, 2, 3, 8, 5, 6, 14, 8, 9, 20, 11}; memset(input_ptr, 0, 12 * sizeof(float)); if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; diff --git a/test/cpp/inference/api/analysis_predictor_tester.cc b/test/cpp/inference/api/analysis_predictor_tester.cc index 3d87140d9c05a..138063c98adfb 100644 --- a/test/cpp/inference/api/analysis_predictor_tester.cc +++ b/test/cpp/inference/api/analysis_predictor_tester.cc @@ -56,10 +56,10 @@ TEST(AnalysisPredictor, analysis_off) { LOG(INFO) << "scope parameters " << predictor->scope_->LocalVarNames().size(); // 2. Dummy Input Data - int64_t data[4] = {1, 2, 3, 4}; + std::array input_data = {1, 2, 3, 4}; PaddleTensor tensor; tensor.shape = std::vector({4, 1}); - tensor.data.Reset(data, sizeof(data)); + tensor.data.Reset(input_data.data(), sizeof(input_data)); tensor.dtype = PaddleDType::INT64; std::vector inputs(4, tensor); @@ -109,10 +109,10 @@ TEST(AnalysisPredictor, analysis_on) { ASSERT_EQ(predictor->GetOutputTypes().size(), 1UL); ASSERT_EQ(predictor->GetOutputTensorShape().size(), 1UL); // 2. Dummy Input Data - int64_t data[4] = {1, 2, 3, 4}; + std::array input_data = {1, 2, 3, 4}; PaddleTensor tensor; tensor.shape = std::vector({4, 1}); - tensor.data.Reset(data, sizeof(data)); + tensor.data.Reset(input_data.data(), sizeof(input_data)); tensor.dtype = PaddleDType::INT64; std::vector inputs(4, tensor); @@ -242,10 +242,10 @@ TEST(AnalysisPredictor, Clone) { << framework::GenScopeTreeDebugInfo(root_scope); // 2. 
Dummy Input Data - int64_t data[4] = {1, 2, 3, 4}; + std::array input_data = {1, 2, 3, 4}; PaddleTensor tensor; tensor.shape = std::vector({4, 1}); - tensor.data.Reset(data, sizeof(data)); + tensor.data.Reset(input_data.data(), sizeof(input_data)); tensor.dtype = PaddleDType::INT64; std::vector inputs(4, tensor); diff --git a/test/cpp/inference/api/analyzer_capi_exp_gpu_tester.cc b/test/cpp/inference/api/analyzer_capi_exp_gpu_tester.cc index 3ff0d86f59916..61d5966d6d92d 100644 --- a/test/cpp/inference/api/analyzer_capi_exp_gpu_tester.cc +++ b/test/cpp/inference/api/analyzer_capi_exp_gpu_tester.cc @@ -64,17 +64,17 @@ TEST(PD_Config, gpu_interface) { EXPECT_TRUE(trt_enable); const char* tensor_name = "image"; - size_t shapes_num[1] = {4}; - int32_t min_shape[4] = {1, 3, 36, 36}; - int32_t max_shape[4] = {1, 3, 224, 224}; - int32_t opt_shape[4] = {1, 3, 224, 224}; - int32_t* min_shape_ptr = min_shape; - int32_t* max_shape_ptr = max_shape; - int32_t* opt_shape_ptr = opt_shape; + std::array shapes_num = {4}; + std::array min_shape = {1, 3, 36, 36}; + std::array max_shape = {1, 3, 224, 224}; + std::array opt_shape = {1, 3, 224, 224}; + int32_t* min_shape_ptr = min_shape.data(); + int32_t* max_shape_ptr = max_shape.data(); + int32_t* opt_shape_ptr = opt_shape.data(); PD_ConfigSetTrtDynamicShapeInfo(config, 1, &tensor_name, - shapes_num, + shapes_num.data(), &min_shape_ptr, &max_shape_ptr, &opt_shape_ptr, diff --git a/test/cpp/inference/api/analyzer_capi_exp_int_tester.cc b/test/cpp/inference/api/analyzer_capi_exp_int_tester.cc index 65d740b229d47..cb3a4db6702c5 100644 --- a/test/cpp/inference/api/analyzer_capi_exp_int_tester.cc +++ b/test/cpp/inference/api/analyzer_capi_exp_int_tester.cc @@ -45,16 +45,16 @@ void predictor_run() { EXPECT_EQ(in_infos->size, 2u); PD_IOInfos* out_infos = PD_PredictorGetOutputInfos(predictor); - int32_t shape_0[4] = {1, 3, 224, 224}; - float data_0[1 * 3 * 224 * 224] = {0}; + std::array shape_0 = {1, 3, 224, 224}; + std::array data_0 = {0}; PD_Tensor* input_0 = PD_PredictorGetInputHandle(predictor, "image"); - PD_TensorReshape(input_0, 4, shape_0); - PD_TensorCopyFromCpuFloat(input_0, data_0); - int32_t shape_1[2] = {1, 1}; - int64_t data_1[1] = {0}; + PD_TensorReshape(input_0, 4, shape_0.data()); + PD_TensorCopyFromCpuFloat(input_0, data_0.data()); + std::array shape_1 = {1, 1}; + std::array data_1 = {0}; PD_Tensor* input_1 = PD_PredictorGetInputHandle(predictor, "label"); - PD_TensorReshape(input_1, 2, shape_1); - PD_TensorCopyFromCpuInt64(input_1, data_1); + PD_TensorReshape(input_1, 2, shape_1.data()); + PD_TensorCopyFromCpuInt64(input_1, data_1.data()); LOG(INFO) << "Run Inference in CAPI encapsulation. 
"; EXPECT_TRUE(PD_PredictorRun(predictor)); diff --git a/test/cpp/inference/api/analyzer_capi_exp_ner_tester.cc b/test/cpp/inference/api/analyzer_capi_exp_ner_tester.cc index 98abb7926ccd9..e83ed41fc85bf 100644 --- a/test/cpp/inference/api/analyzer_capi_exp_ner_tester.cc +++ b/test/cpp/inference/api/analyzer_capi_exp_ner_tester.cc @@ -47,28 +47,29 @@ TEST(PD_PredictorRun, predictor_run) { PD_OneDimArrayCstr *input_names = PD_PredictorGetInputNames(predictor); EXPECT_EQ(input_names->size, 2u); LOG(INFO) << "Predictor start run!"; - PD_Tensor *inputs[2]; + PD_Tensor *inputs[2]; // NOLINT inputs[0] = PD_PredictorGetInputHandle(predictor, input_names->data[0]); inputs[1] = PD_PredictorGetInputHandle(predictor, input_names->data[1]); LOG(INFO) << "Predictor start run!"; // inputs[0]: word, use lod memory in stack - int32_t shape_0[2] = {11, 1}; - int64_t data_0[11 * 1] = {12673, 9763, 905, 284, 45, 7474, 20, 17, 1, 4, 9}; - size_t lod_layer_0[2] = {0, 11}; + std::array shape_0 = {11, 1}; + std::array data_0 = { + 12673, 9763, 905, 284, 45, 7474, 20, 17, 1, 4, 9}; + std::array lod_layer_0 = {0, 11}; PD_OneDimArraySize layer_0; layer_0.size = 2; - layer_0.data = lod_layer_0; + layer_0.data = lod_layer_0.data(); PD_OneDimArraySize *layer_0_ptr = &layer_0; PD_TwoDimArraySize lod_0; lod_0.size = 1; lod_0.data = &layer_0_ptr; - PD_TensorReshape(inputs[0], 2, shape_0); - PD_TensorCopyFromCpuInt64(inputs[0], data_0); + PD_TensorReshape(inputs[0], 2, shape_0.data()); + PD_TensorCopyFromCpuInt64(inputs[0], data_0.data()); PD_TensorSetLod(inputs[0], &lod_0); // inputs[1]: mention, use lod memory in heap - int32_t shape_1[2] = {11, 1}; - int64_t data_1[11 * 1] = {27, 0, 0, 33, 34, 33, 0, 0, 0, 1, 2}; + std::array shape_1 = {11, 1}; + std::array data_1 = {27, 0, 0, 33, 34, 33, 0, 0, 0, 1, 2}; PD_TwoDimArraySize *lod_1_ptr = new PD_TwoDimArraySize(); lod_1_ptr->size = 1; lod_1_ptr->data = new PD_OneDimArraySize *[1]; @@ -78,8 +79,8 @@ TEST(PD_PredictorRun, predictor_run) { lod_1_ptr->data[0]->data[0] = 0; lod_1_ptr->data[0]->data[1] = 11; - PD_TensorReshape(inputs[1], 2, shape_1); - PD_TensorCopyFromCpuInt64(inputs[1], data_1); + PD_TensorReshape(inputs[1], 2, shape_1.data()); + PD_TensorCopyFromCpuInt64(inputs[1], data_1.data()); PD_TensorSetLod(inputs[1], lod_1_ptr); // retrieve the lod memory delete[] lod_1_ptr->data[0]->data; diff --git a/test/cpp/inference/api/analyzer_capi_exp_pd_tensor_tester.cc b/test/cpp/inference/api/analyzer_capi_exp_pd_tensor_tester.cc index 7a32aefb16d30..40a88d7506dbc 100644 --- a/test/cpp/inference/api/analyzer_capi_exp_pd_tensor_tester.cc +++ b/test/cpp/inference/api/analyzer_capi_exp_pd_tensor_tester.cc @@ -45,11 +45,11 @@ void PD_run() { PD_Tensor* tensor = PD_PredictorGetInputHandle(predictor, input_names->data[0]); - int32_t shapes[4] = {1, 3, 300, 300}; + std::array shapes = {1, 3, 300, 300}; std::vector input(1 * 3 * 300 * 300, 0); int32_t size; PD_PlaceType place; - PD_TensorReshape(tensor, 4, shapes); + PD_TensorReshape(tensor, 4, shapes.data()); PD_TensorCopyFromCpuFloat(tensor, input.data()); PD_TensorDataFloat(tensor, &place, &size); PD_TensorMutableDataFloat(tensor, place); @@ -98,11 +98,11 @@ TEST(PD_Tensor, int32) { PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); PD_Tensor* tensor = PD_PredictorGetInputHandle(predictor, input_names->data[0]); - int32_t shapes[4] = {1, 3, 300, 300}; + std::array shapes = {1, 3, 300, 300}; std::vector input(1 * 3 * 300 * 300, 0); int32_t size; PD_PlaceType place; - PD_TensorReshape(tensor, 4, shapes); + 
PD_TensorReshape(tensor, 4, shapes.data()); PD_TensorCopyFromCpuInt32(tensor, input.data()); int32_t* data_ptr = PD_TensorDataInt32(tensor, &place, &size); EXPECT_EQ(place, PD_PLACE_CPU); @@ -129,11 +129,11 @@ TEST(PD_Tensor, int64) { PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); PD_Tensor* tensor = PD_PredictorGetInputHandle(predictor, input_names->data[0]); - int32_t shapes[4] = {1, 3, 300, 300}; + std::array shapes = {1, 3, 300, 300}; std::vector input(1 * 3 * 300 * 300, 0); int32_t size; PD_PlaceType place; - PD_TensorReshape(tensor, 4, shapes); + PD_TensorReshape(tensor, 4, shapes.data()); PD_TensorCopyFromCpuInt64(tensor, input.data()); int64_t* data_ptr = PD_TensorDataInt64(tensor, &place, &size); EXPECT_EQ(place, PD_PLACE_CPU); @@ -160,12 +160,12 @@ TEST(PD_Tensor, uint8) { PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); PD_Tensor* tensor = PD_PredictorGetInputHandle(predictor, input_names->data[0]); - int32_t shapes[4] = {1, 3, 300, 300}; - uint8_t input[1 * 3 * 300 * 300] = {0}; + std::array shapes = {1, 3, 300, 300}; + std::array input = {0}; int32_t size; PD_PlaceType place; - PD_TensorReshape(tensor, 4, shapes); - PD_TensorCopyFromCpuUint8(tensor, input); + PD_TensorReshape(tensor, 4, shapes.data()); + PD_TensorCopyFromCpuUint8(tensor, input.data()); uint8_t* data_ptr = PD_TensorDataUint8(tensor, &place, &size); EXPECT_EQ(place, PD_PLACE_CPU); EXPECT_EQ(size, 1 * 3 * 300 * 300); @@ -174,7 +174,7 @@ TEST(PD_Tensor, uint8) { PD_DataType data_type = PD_TensorGetDataType(tensor); EXPECT_EQ(data_type, PD_DATA_UINT8); - PD_TensorCopyToCpuUint8(tensor, input); + PD_TensorCopyToCpuUint8(tensor, input.data()); PD_TensorDestroy(tensor); PD_OneDimArrayCstrDestroy(input_names); diff --git a/test/cpp/inference/api/analyzer_capi_exp_pd_threads_tester.cc b/test/cpp/inference/api/analyzer_capi_exp_pd_threads_tester.cc index 7cd5ac7e7d482..b06c637c86e47 100644 --- a/test/cpp/inference/api/analyzer_capi_exp_pd_threads_tester.cc +++ b/test/cpp/inference/api/analyzer_capi_exp_pd_threads_tester.cc @@ -84,13 +84,13 @@ void threads_run(int thread_num) { reinterpret_cast(malloc(thread_num * sizeof(pthread_t))); RunParameter* params = reinterpret_cast( malloc(thread_num * sizeof(RunParameter))); - int32_t shapes[4] = {1, 3, 300, 300}; + std::array shapes = {1, 3, 300, 300}; float* input = reinterpret_cast(malloc(1 * 3 * 300 * 300 * sizeof(float))); memset(input, 0, 1 * 3 * 300 * 300 * sizeof(float)); for (int i = 0; i < thread_num; ++i) { params[i].predictor = PD_PredictorClone(predictor); - params[i].shapes = shapes; + params[i].shapes = shapes.data(); params[i].shape_size = 4; params[i].input_data = input; params[i].out_size = 0; diff --git a/test/cpp/inference/api/analyzer_capi_exp_tester.cc b/test/cpp/inference/api/analyzer_capi_exp_tester.cc index 3d5fbd5a0451f..17610f7834039 100644 --- a/test/cpp/inference/api/analyzer_capi_exp_tester.cc +++ b/test/cpp/inference/api/analyzer_capi_exp_tester.cc @@ -53,8 +53,8 @@ void predictor_run() { const int width = 318; float *input = new float[batch_size * channels * height * width](); - int32_t shape[4] = {batch_size, channels, height, width}; - PD_TensorReshape(tensor, 4, shape); + std::array shape = {batch_size, channels, height, width}; + PD_TensorReshape(tensor, 4, shape.data()); PD_TensorCopyFromCpuFloat(tensor, input); EXPECT_TRUE(PD_PredictorRun(predictor)); diff --git a/test/cpp/inference/api/analyzer_dam_tester.cc b/test/cpp/inference/api/analyzer_dam_tester.cc index ea31fe3760b53..3770aac10e371 
100644 --- a/test/cpp/inference/api/analyzer_dam_tester.cc +++ b/test/cpp/inference/api/analyzer_dam_tester.cc @@ -120,8 +120,8 @@ struct DataRecord { void PrepareInputs(std::vector *input_slots, DataRecord *data, int batch_size) { - PaddleTensor turns_tensor[FLAGS_max_turn_num]; - PaddleTensor turns_mask_tensor[FLAGS_max_turn_num]; + PaddleTensor turns_tensor[FLAGS_max_turn_num]; // NOLINT + PaddleTensor turns_mask_tensor[FLAGS_max_turn_num]; // NOLINT PaddleTensor response_tensor; PaddleTensor response_mask_tensor; std::string turn_pre = "turn_"; diff --git a/test/cpp/inference/api/analyzer_lac_tester.cc b/test/cpp/inference/api/analyzer_lac_tester.cc index 9bdb819e5fbd6..ef057227c226c 100644 --- a/test/cpp/inference/api/analyzer_lac_tester.cc +++ b/test/cpp/inference/api/analyzer_lac_tester.cc @@ -139,7 +139,7 @@ TEST(Analyzer_LAC, profile) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { // the first inference result - const int64_t lac_ref_data[] = { + const std::array lac_ref_data = { 24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, 25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43, 44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39, 14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23}; diff --git a/test/cpp/inference/api/analyzer_ner_tester.cc b/test/cpp/inference/api/analyzer_ner_tester.cc index 8027603b7eb15..a1bd037640412 100644 --- a/test/cpp/inference/api/analyzer_ner_tester.cc +++ b/test/cpp/inference/api/analyzer_ner_tester.cc @@ -120,7 +120,7 @@ void profile(bool memory_load = false) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { // the first inference result - const int chinese_ner_result_data[] = { + const std::array chinese_ner_result_data = { 30, 45, 41, 48, 17, 26, 48, 39, 38, 16, 25}; PADDLE_ENFORCE_GT(outputs.size(), 0, diff --git a/test/cpp/inference/api/analyzer_rnn1_tester.cc b/test/cpp/inference/api/analyzer_rnn1_tester.cc index 14a5aa40a4512..72c53ccbdd815 100644 --- a/test/cpp/inference/api/analyzer_rnn1_tester.cc +++ b/test/cpp/inference/api/analyzer_rnn1_tester.cc @@ -191,11 +191,13 @@ void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor, minute_tensor->SetLoD({one_batch.lod3}); // assign data - float arr0[] = {0, 0}; + std::array arr0 = {0, 0}; std::vector zeros(batch_size * 15, 0); + std::copy_n(arr0.data(), + 2, + lod_attention_tensor->mutable_data(PaddlePlace::kCPU)); std::copy_n( - arr0, 2, lod_attention_tensor->mutable_data(PaddlePlace::kCPU)); - std::copy_n(arr0, 2, data_tensor->mutable_data(PaddlePlace::kCPU)); + arr0.data(), 2, data_tensor->mutable_data(PaddlePlace::kCPU)); std::copy_n(zeros.begin(), zeros.size(), cell_init_tensor->mutable_data(PaddlePlace::kCPU)); diff --git a/test/cpp/inference/api/trt_dynamic_shape_ernie_test.cc b/test/cpp/inference/api/trt_dynamic_shape_ernie_test.cc index b28a8eab95d4b..d26946c76856e 100644 --- a/test/cpp/inference/api/trt_dynamic_shape_ernie_test.cc +++ b/test/cpp/inference/api/trt_dynamic_shape_ernie_test.cc @@ -33,22 +33,22 @@ void run(const AnalysisConfig& config, std::vector* out_data, int bs) { const int run_seq_len = 128; size_t len = run_batch * run_seq_len; - int32_t i0_bs1[run_seq_len] = { + std::array i0_bs1 = { 1, 3558, 4, 75, 491, 89, 340, 313, 93, 4, 255, 10, 75, 321, 4095, 1902, 4, 134, 49, 75, 311, 14, 44, 178, 543, 15, 12043, 2, 75, 201, 340, 9, 14, 44, 486, 218, 1140, 279, 12043, 2}; - int32_t i1_bs1[run_seq_len] = { + std::array i1_bs1 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - int32_t i2_bs1[run_seq_len] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, - 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, - 30, 31, 32, 33, 34, 35, 36, 37, 38, 39}; - float i3_bs1[run_seq_len] = { + std::array i2_bs1 = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39}; + std::array i3_bs1 = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; diff --git a/test/cpp/inference/api/trt_dynamic_shape_transformer_prune_test.cc b/test/cpp/inference/api/trt_dynamic_shape_transformer_prune_test.cc index 1f6fa900268d6..515330ec11085 100644 --- a/test/cpp/inference/api/trt_dynamic_shape_transformer_prune_test.cc +++ b/test/cpp/inference/api/trt_dynamic_shape_transformer_prune_test.cc @@ -33,44 +33,44 @@ void run(const AnalysisConfig& config, std::vector* out_data) { tmp_input.reserve(run_batch * run_seq_len); tmp_four_input.reserve(run_batch * run_seq_len); - int64_t i0[run_seq_len] = { + std::array i0 = { 1, 3558, 4, 75, 491, 89, 340, 313, 93, 4, 255, 10, 75, 321, 4095, 1902, 4, 134, 49, 75, 311, 14, 44, 178, 543, 15, 12043, 2, 75, 201, 340, 9, 14, 44, 486, 218, 1140, 279, 12043, 2}; - int64_t i1[run_seq_len] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, - 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, - 30, 31, 32, 33, 34, 35, 36, 37, 38, 39}; - int64_t i2[run_seq_len] = { + std::array i1 = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39}; + std::array i2 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - float i3[run_seq_len] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + std::array i3 = { + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; // first input auto input_t = predictor->GetInputTensor(input_names[0]); input_t->Reshape({run_batch, run_seq_len, 1}); - input_t->copy_from_cpu(i0); + input_t->copy_from_cpu(i0.data()); // second input auto input_t2 = predictor->GetInputTensor(input_names[1]); input_t2->Reshape({run_batch, run_seq_len, 1}); - input_t2->copy_from_cpu(i1); + input_t2->copy_from_cpu(i1.data()); // third input. 
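// The hunks in this patch all apply one mechanical pattern: a fixed-size C
// array becomes a std::array, and call sites that relied on array-to-pointer
// decay pass .data() instead. A minimal self-contained sketch of the pattern,
// assuming only the standard headers shown (the function name is
// illustrative, not part of the patch):
#include <array>
#include <cstdint>
#include <cstring>
void sketch_std_array_migration() {
  // before: int64_t data[4] = {1, 2, 3, 4};   (decays to int64_t*)
  std::array<int64_t, 4> data = {1, 2, 3, 4};
  int64_t dst[4];
  // .data() replaces the implicit decay; on mainstream ABIs
  // sizeof(data) == 4 * sizeof(int64_t), so byte counts handed to
  // memcpy/Reset in the hunks above remain correct after the rewrite.
  std::memcpy(dst, data.data(), sizeof(data));
}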
auto input_t3 = predictor->GetInputTensor(input_names[2]); input_t3->Reshape({run_batch, run_seq_len, 1}); - input_t3->copy_from_cpu(i2); + input_t3->copy_from_cpu(i2.data()); auto input_t4 = predictor->GetInputTensor(input_names[3]); input_t4->Reshape({run_batch, run_seq_len, 1}); - input_t4->copy_from_cpu(i3); + input_t4->copy_from_cpu(i3.data()); ASSERT_TRUE(predictor->ZeroCopyRun()); diff --git a/test/cpp/inference/api/trt_rebind_stream_test.cc b/test/cpp/inference/api/trt_rebind_stream_test.cc index 1f6d5bd8adc68..361335a46be16 100644 --- a/test/cpp/inference/api/trt_rebind_stream_test.cc +++ b/test/cpp/inference/api/trt_rebind_stream_test.cc @@ -41,8 +41,8 @@ TEST(ReBindStream_single, use_gpu) { auto predictor = paddle_infer::CreatePredictor(config); auto x_t = predictor->GetInputHandle("x"); x_t->Reshape({1, 3, 224, 224}); - float x_data[3 * 224 * 224] = {0}; - x_t->CopyFromCpu(x_data); + std::array x_data = {0}; + x_t->CopyFromCpu(x_data.data()); ASSERT_TRUE(predictor->Run()); cudaDeviceSynchronize(); ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream( diff --git a/test/cpp/new_executor/standalone_executor_test.cc b/test/cpp/new_executor/standalone_executor_test.cc index 5a2cb41831f7d..67f7aec8c8dfe 100644 --- a/test/cpp/new_executor/standalone_executor_test.cc +++ b/test/cpp/new_executor/standalone_executor_test.cc @@ -284,8 +284,8 @@ TEST(InterpreterCore, workqueue_multiplexing) { add->SetInput("Y", {"b"}); add->SetOutput("Out", {"c"}); - float data_a[] = {0, 1, 2, 3}; - float data_b[] = {0.0, 0.1, 0.2, 0.3}; + std::array data_a = {0, 1, 2, 3}; + std::array data_b = {0.0, 0.1, 0.2, 0.3}; phi::DDim dims = common::make_ddim({2, 2}); const platform::CPUPlace place = platform::CPUPlace(); @@ -293,8 +293,8 @@ TEST(InterpreterCore, workqueue_multiplexing) { phi::DenseTensor tensor_a = phi::DenseTensor(); phi::DenseTensor tensor_b = phi::DenseTensor(); - std::copy_n(data_a, 4, tensor_a.mutable_data(dims, place)); - std::copy_n(data_b, 4, tensor_b.mutable_data(dims, place)); + std::copy_n(data_a.data(), 4, tensor_a.mutable_data(dims, place)); + std::copy_n(data_b.data(), 4, tensor_b.mutable_data(dims, place)); TestShareWorkQueue( program, {"a", "b"}, {tensor_a, tensor_b}, {"c"}, {0.0, 1.1, 2.2, 3.3}); diff --git a/test/cpp/phi/api/test_from_blob.cc b/test/cpp/phi/api/test_from_blob.cc index c51a184e7eb6f..f936a2445ebfc 100644 --- a/test/cpp/phi/api/test_from_blob.cc +++ b/test/cpp/phi/api/test_from_blob.cc @@ -84,8 +84,8 @@ using phi::memory_utils::Copy; TEST(GetPlaceFromPtr, GPU) { using paddle::GetPlaceFromPtr; - float cpu_data[6]; - auto cpu_data_place = GetPlaceFromPtr(cpu_data); + std::array cpu_data; + auto cpu_data_place = GetPlaceFromPtr(cpu_data.data()); ASSERT_EQ(cpu_data_place, phi::CPUPlace()); std::cout << "cpu_data_place: " << cpu_data_place << std::endl; @@ -109,7 +109,7 @@ TEST(GetPlaceFromPtr, GPU) { TEST(from_blob, GPU) { // 1. 
create data - float cpu_data[6] = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f}; + std::array cpu_data = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f}; phi::GPUPlace gpu0(0); phi::Allocator* allocator = paddle::GetAllocator(gpu0); auto gpu_allocation = allocator->Allocate(sizeof(cpu_data)); @@ -119,7 +119,7 @@ TEST(from_blob, GPU) { Copy(gpu0, gpu_data, phi::CPUPlace(), - cpu_data, + cpu_data.data(), sizeof(cpu_data), ctx->stream()); @@ -137,9 +137,9 @@ TEST(from_blob, GPU) { // 3.2 check tensor values auto* gpu_tensor_data = gpu_tensor.template data(); - float gpu_tensor_data_cpu[6]; + std::array gpu_tensor_data_cpu; Copy(phi::CPUPlace(), - gpu_tensor_data_cpu, + gpu_tensor_data_cpu.data(), gpu0, gpu_tensor_data, sizeof(cpu_data), @@ -155,9 +155,9 @@ TEST(from_blob, GPU) { // 3.4 test other API auto gpu_tensor_pow = paddle::experimental::pow(gpu_tensor, 2); auto* gpu_tensor_pow_data = gpu_tensor_pow.template data(); - float gpu_tensor_pow_data_cpu[6]; + std::array gpu_tensor_pow_data_cpu; Copy(phi::CPUPlace(), - gpu_tensor_pow_data_cpu, + gpu_tensor_pow_data_cpu.data(), gpu0, gpu_tensor_pow_data, sizeof(cpu_data), diff --git a/test/cpp/phi/core/test_custom_kernel.cc b/test/cpp/phi/core/test_custom_kernel.cc index b4a9e9da61913..d32d6eb2ff4f1 100644 --- a/test/cpp/phi/core/test_custom_kernel.cc +++ b/test/cpp/phi/core/test_custom_kernel.cc @@ -214,7 +214,7 @@ TEST(CustomKernel, custom_kernel_dot) { auto* dense_y_data = dev_ctx->template Alloc(dense_y.get()); // dot x,y and result - uint8_t sum[2] = {0, 0}; + std::array sum = {0, 0}; for (size_t i = 0; i < 2; ++i) { for (size_t j = 0; j < 3; ++j) { dense_x_data[i * 3 + j] = (i * 3 + j); diff --git a/test/cpp/phi/kernels/strided_memcpy_test.cc b/test/cpp/phi/kernels/strided_memcpy_test.cc index 9bd893bcd10ab..6fb0014956c46 100644 --- a/test/cpp/phi/kernels/strided_memcpy_test.cc +++ b/test/cpp/phi/kernels/strided_memcpy_test.cc @@ -79,7 +79,7 @@ TEST(StridedMemcpy, CPUConcat) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(StridedMemcpy, GPUCrop) { // clang-format off - int src[] = { + std::array src = { 0, 1, 2, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, @@ -95,11 +95,12 @@ TEST(StridedMemcpy, GPUCrop) { auto src_allocation = phi::memory_utils::Alloc(gpu0, sizeof(src)); int* gpu_src = reinterpret_cast(src_allocation->ptr()); - memory_utils::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream()); + memory_utils::Copy( + gpu0, gpu_src, cpu, src.data(), sizeof(src), ctx->stream()); phi::DDim src_stride({5, 1}); - int dst[4]; + std::array dst; auto dst_allocation = phi::memory_utils::Alloc(gpu0, sizeof(dst)); int* gpu_dst = reinterpret_cast(dst_allocation->ptr()); @@ -109,7 +110,8 @@ TEST(StridedMemcpy, GPUCrop) { phi::funcs::StridedMemcpy( *ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst); - memory_utils::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream()); + memory_utils::Copy( + cpu, dst.data(), gpu0, gpu_dst, sizeof(dst), ctx->stream()); ctx->Wait(); ASSERT_EQ(1, dst[0]); @@ -120,7 +122,7 @@ TEST(StridedMemcpy, GPUCrop) { TEST(StridedMemcpy, GPUConcat) { // clang-format off - int src[] = { + std::array src = { 1, 2, 3, 4 }; @@ -134,9 +136,10 @@ TEST(StridedMemcpy, GPUConcat) { auto gpu_src_allocation = phi::memory_utils::Alloc(gpu0, sizeof(src)); int* gpu_src = reinterpret_cast(gpu_src_allocation->ptr()); - memory_utils::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream()); + memory_utils::Copy( + gpu0, gpu_src, cpu, src.data(), sizeof(src), ctx->stream()); - int dst[8]; + std::array dst; auto gpu_dst_allocation = 
phi::memory_utils::Alloc(gpu0, sizeof(dst)); int* gpu_dst = reinterpret_cast(gpu_dst_allocation->ptr()); @@ -149,11 +152,12 @@ TEST(StridedMemcpy, GPUConcat) { phi::funcs::StridedMemcpy( *ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst + 2); - memory_utils::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream()); + memory_utils::Copy( + cpu, dst.data(), gpu0, gpu_dst, sizeof(dst), ctx->stream()); ctx->Wait(); // clang-format off - int expect_dst[] = { + std::array expect_dst = { 1, 2, 1, 2, 3, 4, 3, 4 }; diff --git a/test/cpp/pir/tools/test_op.cc b/test/cpp/pir/tools/test_op.cc index de7eaa1fb9972..cbcd78a64c27e 100644 --- a/test/cpp/pir/tools/test_op.cc +++ b/test/cpp/pir/tools/test_op.cc @@ -35,7 +35,8 @@ void BranchOp::VerifySig() const { IR_ENFORCE((*this)->successor(0), "successor[0] can't be nullptr"); } -const char *Operation1::attributes_name[2] = {"op1_attr1", "op1_attr2"}; +const char *Operation1::attributes_name[2] = {"op1_attr1", + "op1_attr2"}; // NOLINT void Operation1::Build(pir::Builder &builder, // NOLINT pir::OperationArgument &argument) { // NOLINT From 4d0be7f12b2c6d6ee629c2bc5d9dd587ae5f8f6e Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Fri, 1 Mar 2024 10:57:48 +0800 Subject: [PATCH 44/55] [clang-tidy] NO.24 enable hicpp-exception-baseclass (#61691) --- test/cpp/inference/api/analyzer_bert_tester.cc | 10 +++++++--- test/cpp/pir/core/ir_program_test.cc | 9 ++++++--- test/cpp/pir/pass/pass_manager_test.cc | 11 +++++++---- test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc | 11 ++++++----- test/cpp/pir/tools/test_op.cc | 10 +++++++--- 5 files changed, 33 insertions(+), 18 deletions(-) diff --git a/test/cpp/inference/api/analyzer_bert_tester.cc b/test/cpp/inference/api/analyzer_bert_tester.cc index 0ad6e6cc90298..9f60c72cb0bdf 100644 --- a/test/cpp/inference/api/analyzer_bert_tester.cc +++ b/test/cpp/inference/api/analyzer_bert_tester.cc @@ -13,6 +13,8 @@ // limitations under the License. #include "paddle/fluid/framework/transfer_scope_cache.h" +#include "paddle/fluid/platform/errors.h" +#include "paddle/phi/core/enforce.h" #include "test/cpp/inference/api/tester_helper.h" namespace paddle { @@ -159,7 +161,7 @@ void profile(bool use_mkldnn, bool use_bfloat16) { std::vector> LoadInputData() { if (FLAGS_infer_data.empty()) { LOG(ERROR) << "please set input data path"; - throw "missing input data path"; + PADDLE_THROW(platform::errors::NotFound("Missing input data path")); } std::ifstream fin(FLAGS_infer_data); @@ -190,7 +192,8 @@ std::vector ParseInputStreamToVector( const std::string &line) { const auto fields = Split(line, ';'); - if (fields.size() < 5) throw "invalid input line"; + if (fields.size() < 5) + PADDLE_THROW(platform::errors::Fatal("Invalid input line")); std::vector tensors; @@ -228,7 +231,8 @@ AnalysisConfig SetConfig(bool use_mkldnn, bool use_bfloat16) { template paddle::PaddleTensor ParseTensor(const std::string &field) { const auto data = Split(field, ':'); - if (data.size() < 2) throw "invalid data field"; + if (data.size() < 2) + PADDLE_THROW(platform::errors::Fatal("Invalid data field")); std::string shape_str = data[0]; const auto shape = Split(shape_str, ' '); diff --git a/test/cpp/pir/core/ir_program_test.cc b/test/cpp/pir/core/ir_program_test.cc index 0dce6f95c08c7..2957782145a28 100644 --- a/test/cpp/pir/core/ir_program_test.cc +++ b/test/cpp/pir/core/ir_program_test.cc @@ -34,8 +34,9 @@ // paddle/fluid/pir/dialect/CMakeLists.txt. 
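// hicpp-exception-baseclass flags any throw whose operand does not derive
// from std::exception, so `throw "message"` (a bare const char*) is rejected.
// The hunks in this patch route such throws through PADDLE_THROW, which
// raises Paddle's enforce exception instead. A standalone analogue of the
// fix, assuming only <stdexcept> (parse_line is a made-up name):
#include <stdexcept>
void parse_line(int nfields) {
  if (nfields < 5) {
    // before: throw "invalid input line";           // const char*, flagged
    throw std::runtime_error("invalid input line");  // std::exception subclass
  }
}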
#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/fluid/pir/dialect/operator/transforms/param_to_variable.h" +#include "paddle/fluid/platform/errors.h" +#include "paddle/phi/core/enforce.h" #include "test/cpp/pir/tools/macros_utils.h" - class AddOp : public pir::Op { public: using Op::Op; @@ -51,10 +52,12 @@ class AddOp : public pir::Op { }; void AddOp::VerifySig() { if (num_operands() != 2) { - throw("The size of inputs must be equal to 2."); + PADDLE_THROW(paddle::platform::errors::Fatal( + "The size of inputs must be equal to 2.")); } if (num_results() != 1) { - throw("The size of outputs must be equal to 1."); + PADDLE_THROW(paddle::platform::errors::Fatal( + "The size of outputs must be equal to 1.")); } } void AddOp::Build(pir::Builder &, diff --git a/test/cpp/pir/pass/pass_manager_test.cc b/test/cpp/pir/pass/pass_manager_test.cc index f4f4a25bd40b6..2a1c9a4ae4fdd 100644 --- a/test/cpp/pir/pass/pass_manager_test.cc +++ b/test/cpp/pir/pass/pass_manager_test.cc @@ -17,12 +17,13 @@ // NOTE(zhangbo9674): File pd_op.h is generated by op_gen.py, see details in // paddle/fluid/pir/dialect/CMakeLists.txt. -#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" - #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/fluid/pir/dialect/operator/utils/utils.h" +#include "paddle/fluid/platform/errors.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/elementwise_add_kernel.h" #include "paddle/pir/include/core/builtin_dialect.h" #include "paddle/pir/include/core/builtin_op.h" @@ -79,10 +80,12 @@ class AddOp : public pir::Op { }; void AddOp::VerifySig() { if (num_operands() != 2) { - throw("The size of inputs must be equal to 2."); + PADDLE_THROW(paddle::platform::errors::Fatal( + "The size of inputs must be equal to 2.")); } if (num_results() != 1) { - throw("The size of outputs must be equal to 1."); + PADDLE_THROW(paddle::platform::errors::Fatal( + "The size of outputs must be equal to 1.")); } } void AddOp::Build(pir::Builder &, diff --git a/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc b/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc index 9c18ba550e00d..70f0f5ec0760a 100644 --- a/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc +++ b/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc @@ -20,6 +20,7 @@ #include #include +#include "paddle/common/enforce.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" @@ -31,8 +32,7 @@ #include "paddle/fluid/pir/transforms/fusion/conv2d_add_fuse_pass.h" #include "paddle/fluid/pir/transforms/fusion/conv2d_bn_fuse_pass.h" #include "paddle/fluid/pir/transforms/transform_general_functions.h" - -#include "paddle/common/enforce.h" +#include "paddle/fluid/platform/errors.h" #include "paddle/pir/include/core/builder.h" #include "paddle/pir/include/core/builtin_attribute.h" #include "paddle/pir/include/core/builtin_dialect.h" @@ -54,7 +54,6 @@ #include "paddle/common/ddim.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" - #include "test/cpp/pir/tools/macros_utils.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); @@ -85,11 +84,13 @@ void Operation1::VerifySig() { auto &attributes = this->attributes(); if (attributes.count("op2_attr1") == 0 || 
(!attributes.at("op2_attr1").isa())) { - throw("Type of attribute: parameter_name is not right."); + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Type of attribute: parameter_name is not right.")); } if (attributes.count("op2_attr2") == 0 || (!attributes.at("op2_attr2").isa())) { - throw("Type of attribute: parameter_name is not right."); + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Type of attribute: parameter_name is not right.")); } } const char *Operation1::attributes_name[attributes_num] = { // NOLINT diff --git a/test/cpp/pir/tools/test_op.cc b/test/cpp/pir/tools/test_op.cc index cbcd78a64c27e..6bfb0767b3d43 100644 --- a/test/cpp/pir/tools/test_op.cc +++ b/test/cpp/pir/tools/test_op.cc @@ -11,10 +11,12 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + #include "test/cpp/pir/tools/test_op.h" #include "paddle/common/enforce.h" +#include "paddle/fluid/platform/errors.h" +#include "paddle/phi/core/enforce.h" #include "paddle/pir/include/core/builtin_attribute.h" - namespace test { void RegionOp::Build(pir::Builder &builder, pir::OperationArgument &argument) { @@ -50,11 +52,13 @@ void Operation1::VerifySig() const { auto &attributes = this->attributes(); if (attributes.count("op1_attr1") == 0 || !attributes.at("op1_attr1").isa()) { - throw("Type of attribute: parameter_name is not right."); + PADDLE_THROW(paddle::platform::errors::Fatal( + "Type of attribute: parameter_name is not right.")); } if (attributes.count("op1_attr2") == 0 || !attributes.at("op1_attr2").isa()) { - throw("Type of attribute: parameter_name is not right."); + PADDLE_THROW(paddle::platform::errors::Fatal( + "Type of attribute: parameter_name is not right.")); } } From 3ff45072a154547692594206036e9e50e08d0f15 Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Fri, 1 Mar 2024 10:58:24 +0800 Subject: [PATCH 45/55] [clang-tidy] NO.7 bugprone-branch-clone (#61735) --- .../fleet_executor/compute_interceptor.cc | 4 +- .../distributed/fleet_executor/dist_model.cc | 2 +- .../custom_operator/custom_operator_utils.cc | 4 +- paddle/fluid/eager/grad_tensor_holder.cc | 2 +- paddle/fluid/framework/data_feed.cc | 8 +- paddle/fluid/framework/data_set.cc | 14 ++-- .../framework/details/nan_inf_utils_detail.cc | 2 +- paddle/fluid/framework/dist_multi_trainer.cc | 2 +- paddle/fluid/framework/executor.cc | 2 +- .../fluid/framework/heter_section_worker.cc | 2 +- paddle/fluid/framework/infershape_utils.cc | 4 +- .../framework/ir/coalesce_grad_tensor_pass.cc | 2 +- .../framework/ir/generate_pass_tester.cc | 2 +- .../framework/ir/identity_op_clean_pass.cc | 2 +- ...ute_propagate_scales_mkldnn_pass_tester.cc | 2 +- .../ir/mkldnn/cpu_quantize_pass_tester.cc | 5 +- .../mkldnn/cpu_quantize_squash_pass_tester.cc | 2 +- ...t8_scale_calculation_mkldnn_pass_tester.cc | 9 +- .../multi_devices_graph_pass.cc | 6 +- .../framework/ir/transfer_layout_elim_pass.cc | 2 +- .../garbage_collector/garbage_collector.cc | 8 +- .../no_event_garbage_collector.cc | 7 +- .../new_executor/new_executor_defs.cc | 2 +- .../framework/new_executor/pir_interpreter.cc | 4 +- .../new_executor/standalone_executor.cc | 2 +- paddle/fluid/framework/operator.cc | 15 ++-- paddle/fluid/framework/section_worker.cc | 2 +- paddle/fluid/imperative/amp_auto_cast.cc | 5 +- .../fluid/imperative/gradient_accumulator.cc | 4 +- paddle/fluid/imperative/layout_autotune.cc | 2 +- paddle/fluid/imperative/nccl_context.cc | 2 +- 
.../fluid/imperative/partial_grad_engine.cc | 2 +- paddle/fluid/imperative/prepared_operator.cc | 6 +- paddle/fluid/imperative/reducer.cc | 4 +- paddle/fluid/imperative/var_helper.cc | 3 +- .../analysis/ir_passes/lite_subgraph_pass.cc | 14 ++-- .../analysis/passes/ir_graph_build_pass.cc | 2 +- .../fluid/inference/api/analysis_predictor.cc | 21 ++--- paddle/fluid/inference/api/api_impl.cc | 4 +- .../fluid/inference/api/mkldnn_quantizer.cc | 6 +- .../ir_adaptor/translator/op_translator.cc | 2 +- paddle/fluid/jit/property.cc | 2 +- .../fluid/operators/reader/buffered_reader.cc | 2 +- .../fluid/pir/drr/src/ir_operation_factory.cc | 2 +- paddle/fluid/platform/place.cc | 2 - paddle/fluid/platform/profiler.cc | 28 +++---- paddle/fluid/pybind/eager.cc | 2 +- paddle/fluid/pybind/eager_functions.cc | 2 +- paddle/fluid/pybind/eager_math_op_patch.cc | 4 +- paddle/fluid/pybind/eager_utils.cc | 7 +- paddle/fluid/pybind/parallel_executor.cc | 2 +- paddle/fluid/pybind/pybind.cc | 4 +- paddle/phi/core/compat/convert_utils.cc | 6 +- paddle/phi/core/kernel_registry.cc | 84 ++++++++++++------- paddle/phi/infermeta/unary.cc | 11 +-- .../phi/kernels/cpu/batch_norm_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/batch_norm_kernel.cc | 4 +- .../kernels/cpu/elementwise_divide_kernel.cc | 2 +- paddle/phi/kernels/cpu/rnn_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/rnn_kernel.cc | 2 +- paddle/phi/kernels/funcs/sequence_pooling.cc | 2 +- .../kernels/legacy/cpu/elementwise_kernel.cc | 4 +- .../details/fused_broadcast_op_handle_test.cc | 2 +- .../imperative/test_gradient_accmulator.cc | 4 +- 64 files changed, 192 insertions(+), 185 deletions(-) diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc index 8da1ef87814de..5e2be03108294 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc @@ -176,7 +176,7 @@ bool ComputeInterceptor::IsInputReady() { flag = flag && (ready_size_map.at(i) != 0); } if (flag) { - if (scope_id_to_finish_flag.empty()) { + if (scope_id_to_finish_flag.empty()) { // NOLINT cur_scope_id_ = i; return true; } else if (scope_id_to_finish_flag.find(i) != @@ -303,7 +303,7 @@ void ComputeInterceptor::RunOps() { cur_scope_id_)); } - if (!cores_.empty()) { + if (!cores_.empty()) { // NOLINT cores_[cur_scope_id_]->Run(/*feed_names=*/{}, /*need_fetch=*/false); } else { for (auto op : node_->ops()) { diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index a1fd38295319e..4c19069b33705 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -215,7 +215,7 @@ bool DistModel::Init() { } bool DistModel::PreparePlace() { - if (config_.place == "GPU") { + if (config_.place == "GPU") { // NOLINT place_ = paddle::platform::CUDAPlace(config_.device_id); } else if (config_.place == "CPU") { place_ = paddle::platform::CPUPlace(); diff --git a/paddle/fluid/eager/custom_operator/custom_operator_utils.cc b/paddle/fluid/eager/custom_operator/custom_operator_utils.cc index b843e081c29be..a9272053346a7 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_utils.cc +++ b/paddle/fluid/eager/custom_operator/custom_operator_utils.cc @@ -558,7 +558,7 @@ std::vector> RunInferShapeFn( out_dims = RunInferShapeFunc(ctx, infer_shape_func, inputs, outputs, inplace_map); } else { - if (is_forward) { + if 
(is_forward) { // NOLINT out_dims = RunDefaultInferShapeFunc(ctx, inputs, outputs, inplace_map); } else { out_dims = @@ -592,7 +592,7 @@ std::vector> RunInferDtypeFn( out_dtypes = RunInferDtypeFunc(ctx, infer_dtype_func, inputs, outputs, inplace_map); } else { - if (is_forward) { + if (is_forward) { // NOLINT out_dtypes = RunDefaultInferDtypeFunc(ctx, inputs, outputs, inplace_map); } else { out_dtypes = diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index dac55f8f5462f..47f41b5a4f93b 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -79,7 +79,7 @@ void GradTensorHolder::CopyValueFromTensor(size_t slot_id, // Create new tensor->impl and fill it with 1.0 if (t.defined()) { // Fill 1.0, use full to support complex, one_like don't support it. - if (t.is_dense_tensor()) { + if (t.is_dense_tensor()) { // NOLINT buffer_[slot_id][rank] = paddle::experimental::full(t.shape(), 1, t.dtype(), t.place()); } else if (t.is_sparse_csr_tensor() || t.is_sparse_coo_tensor()) { diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index cec1f664ce0f1..9489d22e34d21 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -1813,7 +1813,7 @@ int PaddleBoxDataFeed::Next() { this->batch_size_ = index; VLOG(3) << "pv_batch_size_=" << this->batch_size_ << ", thread_id=" << thread_id_; - if (this->batch_size_ != 0) { + if (this->batch_size_ != 0) { // NOLINT PutToFeedVec(pv_vec); } else { VLOG(3) << "finish reading, output_pv_channel_ size=" @@ -2113,7 +2113,7 @@ void SlotRecordInMemoryDataFeed::Init(const DataFeedDesc& data_feed_desc) { finish_init_ = true; input_type_ = data_feed_desc.input_type(); size_t pos = pipe_command_.find(".so"); - if (pos != std::string::npos) { + if (pos != std::string::npos) { // NOLINT pos = pipe_command_.rfind('|'); if (pos == std::string::npos) { so_parser_name_ = pipe_command_; @@ -2129,7 +2129,7 @@ void SlotRecordInMemoryDataFeed::Init(const DataFeedDesc& data_feed_desc) { #if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) gpu_graph_data_generator_.SetConfig(data_feed_desc); #endif - if (gpu_graph_mode_) { + if (gpu_graph_mode_) { // NOLINT train_mode_ = true; } else { train_mode_ = data_feed_desc.graph_config().gpu_graph_training(); @@ -2780,7 +2780,7 @@ int SlotRecordInMemoryDataFeed::Next() { this->batch_size_ = batch.second; VLOG(3) << "batch_size_=" << this->batch_size_ << ", thread_id=" << thread_id_; - if (this->batch_size_ != 0) { + if (this->batch_size_ != 0) { // NOLINT PutToFeedVec(&records_[batch.first], this->batch_size_); } else { VLOG(3) << "finish reading for heterps, batch size zero, thread_id=" diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index 0c48c6e1a25ad..20934879c9a13 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -966,7 +966,7 @@ void DatasetImpl::DynamicAdjustChannelNum(int channel_num, CHECK(output_channels_data_size == 0); // NOLINT cur_channel = 1; } - if (cur_channel == 0) { + if (cur_channel == 0) { // NOLINT origin_channels = &multi_output_channel_; other_channels = &multi_consume_channel_; origin_pv_channels = &multi_pv_output_; @@ -1111,8 +1111,8 @@ void DatasetImpl::CreateReaders() { if (input_pv_channel_ != nullptr) { readers_[i]->SetInputPvChannel(input_pv_channel_.get()); } - if (cur_channel_ == 0 && - static_cast(channel_idx) < multi_output_channel_.size()) { + if 
(cur_channel_ == 0 && static_cast(channel_idx) < + multi_output_channel_.size()) { // NOLINT readers_[i]->SetOutputChannel(multi_output_channel_[channel_idx].get()); readers_[i]->SetConsumeChannel(multi_consume_channel_[channel_idx].get()); readers_[i]->SetOutputPvChannel(multi_pv_output_[channel_idx].get()); @@ -1722,7 +1722,7 @@ void MultiSlotDataset::PreprocessChannel( const std::set& slots_to_replace, std::unordered_set& index_slots) { // NOLINT int out_channel_size = 0; - if (cur_channel_ == 0) { + if (cur_channel_ == 0) { // NOLINT for (auto& item : multi_output_channel_) { out_channel_size += static_cast(item->Size()); } @@ -1757,7 +1757,7 @@ void MultiSlotDataset::PreprocessChannel( input_channel_->ReadAll(slots_shuffle_original_data_); } else { CHECK(out_channel_size > 0); // NOLINT - if (cur_channel_ == 0) { + if (cur_channel_ == 0) { // NOLINT for (auto& item : multi_output_channel_) { std::vector vec_data; item->Close(); @@ -1792,7 +1792,7 @@ void MultiSlotDataset::PreprocessChannel( } else { // if already have original data for slots shuffle, clear channel input_channel_->Clear(); - if (cur_channel_ == 0) { + if (cur_channel_ == 0) { // NOLINT for (auto& item : multi_output_channel_) { if (!item) { continue; @@ -1809,7 +1809,7 @@ void MultiSlotDataset::PreprocessChannel( } } int end_size = 0; - if (cur_channel_ == 0) { + if (cur_channel_ == 0) { // NOLINT for (auto& item : multi_output_channel_) { if (!item) { continue; diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 551a10f1ccacd..d18cee16b19a6 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -264,7 +264,7 @@ void CheckOpHasNanOrInf(const framework::OperatorBase& op, if (IsSkipOp(op)) return; - if (op_var_nan_inf_white_list().count(op.Type()) == 0) { + if (op_var_nan_inf_white_list().count(op.Type()) == 0) { // NOLINT // NOTE. vname may destruct in the end of this func. 
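// bugprone-branch-clone fires when consecutive branches carry identical
// bodies. This patch uses two remedies: fold the conditions together where
// the duplication is accidental, and append // NOLINT where the clone is
// intentional (kept for readability, or identical only under some build
// configurations). A minimal sketch of the folding remedy, with made-up
// names, mirroring the int8_scale_calculation change elsewhere in this patch:
#include <cstdio>
void report(bool a, bool b) {
  // before: if (a) { std::puts("hit"); } else if (b) { std::puts("hit"); }
  if (a || b) {  // identical bodies folded into a single branch
    std::puts("hit");
  }
}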
for (auto& vname : op.OutputVars(true)) { auto* var = exec_scope.FindVar(vname); diff --git a/paddle/fluid/framework/dist_multi_trainer.cc b/paddle/fluid/framework/dist_multi_trainer.cc index 6fd95267ef6ab..119b6e569cef3 100644 --- a/paddle/fluid/framework/dist_multi_trainer.cc +++ b/paddle/fluid/framework/dist_multi_trainer.cc @@ -157,7 +157,7 @@ void DistMultiTrainer::Run() { std::vector> wait_futures; CHECK_EQ(static_cast(pool.size()), thread_num_); for (int i = 0; i < thread_num_; ++i) { - if (!debug_) { + if (!debug_) { // NOLINT wait_futures.emplace_back( pool[i]->Run([this, i]() { workers_[i]->TrainFiles(); })); } else { diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index d935e9ea066bd..fbc2565e755fa 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -99,7 +99,7 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, while (ancestor_scope->parent()) { ancestor_scope = ancestor_scope->parent(); } - if (ancestor_scope != scope) { + if (ancestor_scope != scope) { // NOLINT for (auto& var : global_block.AllVars()) { if (var->Name() == framework::kEmptyVarName) { continue; diff --git a/paddle/fluid/framework/heter_section_worker.cc b/paddle/fluid/framework/heter_section_worker.cc index cecfa39d3c16b..942f776b2323f 100644 --- a/paddle/fluid/framework/heter_section_worker.cc +++ b/paddle/fluid/framework/heter_section_worker.cc @@ -126,7 +126,7 @@ void HeterSectionWorker::Initialize(const TrainerDesc& desc) { bool is_first_stage = (pipeline_stage_ == 0); bool is_last_stage = (pipeline_stage_ + 1 == num_pipeline_stages_); - if (is_first_stage) { + if (is_first_stage) { // NOLINT for (auto& op_desc : program_->Block(0).AllOps()) { auto op = std::move(OpRegistry::CreateOp(*op_desc)); auto op_type = op->Type(); diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index bcf72be80decb..932e467e23dc0 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -658,7 +658,7 @@ CompatInferMetaContext BuildInferMetaContext(InferShapeContext* ctx, if (attr_ptr && !is_attr_var) { auto& attr = *attr_ptr; switch (AttrTypeID(attr)) { - case framework::proto::AttrType::INTS: + case framework::proto::AttrType::INTS: // NOLINT infer_meta_context.EmplaceBackAttr(std::move( phi::IntArray(PADDLE_GET_CONST(std::vector, attr)))); break; @@ -836,7 +836,7 @@ CompatInferMetaContext BuildInferMetaContext(InferShapeContext* ctx, attr_names[i])); } break; - case phi::AttributeType::FLOAT32S: + case phi::AttributeType::FLOAT32S: // NOLINT infer_meta_context.EmplaceBackAttr( PADDLE_GET_CONST(std::vector, attr)); break; diff --git a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc index 44cb004fec172..966f4ea14967d 100644 --- a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc +++ b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc @@ -134,7 +134,7 @@ class CoalesceGradTensorPass : public ir::Pass { auto &pinned_var_set = graph->GetOrInit(details::kPinnedVars); - if (IsUnifiedDtype(p_g_dense_grad, vars_info)) { + if (IsUnifiedDtype(p_g_dense_grad, vars_info)) { // NOLINT RecordGradients(p_g_dense_grad, vars_info, &pinned_var_set); CoalesceTensors(vars_info, p_g_dense_grad, &result); } else { diff --git a/paddle/fluid/framework/ir/generate_pass_tester.cc b/paddle/fluid/framework/ir/generate_pass_tester.cc index 760e1e8ce4ef8..58a3741a924aa 100644 --- 
a/paddle/fluid/framework/ir/generate_pass_tester.cc +++ b/paddle/fluid/framework/ir/generate_pass_tester.cc @@ -25,7 +25,7 @@ REGISTER_GENERATE_PASS(generate_fc_fuse) { VLOG(3) << "exec lambda func."; auto mul = OP_(mul)({{"X", x}, {"Y", y}}).Out("Out"); auto ewadd = OP_(elementwise_add)({{"X", mul}, {"Y", z}}).Out("Out"); - if (with_relu) { + if (with_relu) { // NOLINT return OP_(relu)({"X", ewadd}).Out("Out"); } else { return ewadd; diff --git a/paddle/fluid/framework/ir/identity_op_clean_pass.cc b/paddle/fluid/framework/ir/identity_op_clean_pass.cc index ab9df0ae4abee..55316c1b82310 100644 --- a/paddle/fluid/framework/ir/identity_op_clean_pass.cc +++ b/paddle/fluid/framework/ir/identity_op_clean_pass.cc @@ -70,7 +70,7 @@ FindUselessOpPattern::FindUselessOpPattern(PDPattern* pattern, auto in_dtype = x->Op()->GetAttrIfExists("in_dtype"); auto out_dtype = x->Op()->GetAttrIfExists("out_dtype"); return in_dtype == out_dtype; - } else if (op_type == "c_identity") { + } else if (op_type == "c_identity") { // NOLINT return true; } else if (op_type == "assign") { const auto& in_name = x->Op()->Input("X")[0]; diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc index 0f0d385569083..c09a2d1ffbb8d 100644 --- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc @@ -161,7 +161,7 @@ class ComputePropagateScalesMkldnnPassTest : public testing::Test { begin(wh[i]), end(wh[i]), wh_tensor->mutable_data(phi::CPUPlace()) + i * wh[0].size()); - if (type == "gru") { + if (type == "gru") { // NOLINT ComputeGruWeightScales( graph, &scope, wx_name, wh_name, &var_quant_scales); } else { diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index bad886ae40cdf..c7e15e24216aa 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -61,7 +61,7 @@ void SetOp(ProgramDesc* prog, op->SetOutput("Output", {outputs[0]}); } else if (type == "pool2d" || type == "fused_transpose" || type == "reshape2" || type == "nearest_interp" || - type == "nearest_interp_v2") { + type == "nearest_interp_v2" || type == "dropout") { op->SetInput("X", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); } else if (type == "slice") { @@ -70,9 +70,6 @@ void SetOp(ProgramDesc* prog, } else if (type == "split") { op->SetInput("X", {inputs[0]}); op->SetOutput("Out", {outputs}); - } else if (type == "dropout") { - op->SetInput("X", {inputs[0]}); - op->SetOutput("Out", {outputs[0]}); } else if (type == "fc") { op->SetInput("Input", {inputs[0]}); if (inputs.size() > 1) op->SetInput("W", {inputs[1]}); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc index d2c6d981c3a2e..89e57108b17ef 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc @@ -41,7 +41,7 @@ void SetOp(ProgramDesc* prog, if (type != "dropout" && type != "quantize" && type != "dequantize") { op->SetAttr("mkldnn_data_type", mkldnn_data_type); } - if (type == "pool2d") { + if (type == "pool2d") { // NOLINT op->SetInput("X", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); if (!scale.empty()) 
op->SetAttr("Scale_in", scale[0]); diff --git a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc index 44856c086dc93..fde7fb07b9108 100644 --- a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc @@ -70,14 +70,7 @@ ProgramDesc BuildProgramDesc(bool convWithExistingBias, } } - if (convWithExistingBias) { - SetOp(&prog, - "conv2d", - "conv", - std::vector({"c", "weights", "conv_bias"}), - std::vector({"f"}), - scale_weights); - } else if (scale_weights.size() > 1) { + if (convWithExistingBias || scale_weights.size() > 1) { SetOp(&prog, "conv2d", "conv", diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc index 295ef57cfdfea..cc20f52180871 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc @@ -933,7 +933,7 @@ bool ReduceSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, void ReduceSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { if (UseGPU()) { - if (strategy_.fuse_broadcast_ops_ == true) { + if (strategy_.fuse_broadcast_ops_ == true) { // NOLINT CreateFusedBroadcastOp(result, bcast_var_name_set_); } else { for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) { @@ -1193,7 +1193,7 @@ int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { node->Op()->Type())); // Create fetch_barrier op handle to enable output on all devices. // **NOTE** fetch_barrier should output variables list same as recv op does. 
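// Where a clone is deliberate, the patch instead keeps the branch and
// appends // NOLINT so clang-tidy suppresses the warning on that line only;
// the named form shown here also works and scopes the suppression to a
// single check. A hypothetical sketch (dispatch and handle_default are
// made-up names, not part of the patch):
#include <cstdio>
#include <string>
void handle_default() { std::puts("run op"); }
void dispatch(const std::string& type) {
  // Both arms run the same code today; the split is kept so the special
  // case stays visible at the call site, as with fetch_barrier below.
  if (type == "fetch_barrier") {  // NOLINT(bugprone-branch-clone)
    handle_default();
  } else {
    handle_default();
  }
}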
- if (node->Op()->Type() == "fetch_barrier") { + if (node->Op()->Type() == "fetch_barrier") { // NOLINT result->Get(kGraphOps).emplace_back( new details::FetchBarrierOpHandle( result->CreateOpNode(node->Op()), local_scopes_, places_)); @@ -1354,7 +1354,7 @@ void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { strategy_.reduce_ == details::BuildStrategy::ReduceStrategy::kReduce) { return; } - if (strategy_.fuse_broadcast_ops_ == true) { + if (strategy_.fuse_broadcast_ops_ == true) { // NOLINT CreateFusedBroadcastOp(result, bcast_var_name_set_); } else { for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) { diff --git a/paddle/fluid/framework/ir/transfer_layout_elim_pass.cc b/paddle/fluid/framework/ir/transfer_layout_elim_pass.cc index 3a9a2c81889ee..ac3441eb7e737 100644 --- a/paddle/fluid/framework/ir/transfer_layout_elim_pass.cc +++ b/paddle/fluid/framework/ir/transfer_layout_elim_pass.cc @@ -239,7 +239,7 @@ void TransferLayoutElimPass::ApplyImpl(ir::Graph *graph) const { FusePassBase::Init(pattern_name, graph); auto transfer_format = [&](std::string data_format) -> std::string { - if (data_format == "NCHW") { + if (data_format == "NCHW") { // NOLINT return "NHWC"; } else if (data_format == "NHWC") { return "NCHW"; diff --git a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc index 166853e2b18da..0d73e2d3fede9 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc @@ -32,14 +32,14 @@ CreateInterpreterCoreGarbageCollector( const platform::Place& place, const std::vector>& vec_instruction) { if (platform::is_gpu_place(place)) { - if (IsInterpretercoreFastGCEnabled()) { + if (IsInterpretercoreFastGCEnabled()) { // NOLINT return std::unique_ptr( new InterpreterCoreFastGarbageCollector()); } else { return std::unique_ptr( new InterpreterCoreEventGarbageCollector(vec_instruction)); } - } else if (platform::is_xpu_place(place)) { + } else if (platform::is_xpu_place(place)) { // NOLINT // Because there is no multi-stream on XPU device, fast GC can // be used. // Previously, XPU used no_event GC. But `Wait` in no_event GC @@ -62,14 +62,14 @@ CreateInterpreterCoreGarbageCollector( const platform::Place& place, const std::vector& vec_instruction) { if (platform::is_gpu_place(place)) { - if (IsInterpretercoreFastGCEnabled()) { + if (IsInterpretercoreFastGCEnabled()) { // NOLINT return std::unique_ptr( new InterpreterCoreFastGarbageCollector()); } else { return std::unique_ptr( new InterpreterCoreEventGarbageCollector(vec_instruction)); } - } else if (platform::is_xpu_place(place)) { + } else if (platform::is_xpu_place(place)) { // NOLINT // Because there is no multi-stream on XPU device, fast GC can // be used. // Previously, XPU used no_event GC. 
But `Wait` in no_event GC diff --git a/paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.cc index 3b7ebc18f36da..d236e740679dd 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.cc @@ -49,9 +49,10 @@ void InterpreterCoreNoEventGarbageCollector::Add( if (var->IsType()) { Add(var->GetMutable()->MoveMemoryHolder(), ctx); - } else if (var->IsType< - operators::reader:: - OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { + } else if ( + var->IsType< + operators::reader:: + OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { // NOLINT // TODO(xiongkun03) in old executor, this type of variable is not support // eager deletion. so we just leave it here ? } else if (var->IsType()) { diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index b3ec52029bb5b..6c9e5b4a877d5 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -94,7 +94,7 @@ void VariableScope::AddVar(const std::string& name, auto id = VarSize(); name2id_[name] = static_cast(id); vec_meta_info_.emplace_back(0, var_desc); - if (local_scope_ != nullptr) { + if (local_scope_ != nullptr) { // NOLINT var_list_.push_back(local_scope_->FindVar(name)); } else { var_list_.push_back(scope_->FindVar(name)); diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index 236f18dfb223c..3690c67ac58f4 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -702,7 +702,7 @@ void PirInterpreter::BuildInstruction() { continue; } } else if (op.dialect()->name() == "pd_op") { - if (op.isa()) { + if (op.isa()) { // NOLINT vec_instruction_base_.emplace_back(std::make_unique( op_idx++, place_, &op, value_exe_info_.get(), execution_config_)); sub_blocks_.insert( @@ -751,7 +751,7 @@ void PirInterpreter::BuildInstruction() { } VLOG(6) << "process " << op_name; - if (op.isa()) { + if (op.isa()) { // NOLINT CREATE_INSTR(LegacyKernelInstruction); } else { CREATE_INSTR(PhiKernelInstruction); diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 2bb0a7197774e..74e09a15d6246 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -57,7 +57,7 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, const std::string& job_type = job->Type(); std::shared_ptr program = nullptr; std::shared_ptr<::pir::Program> ir_program = nullptr; - if (FLAGS_enable_pir_api || FLAGS_enable_pir_in_executor) { + if (FLAGS_enable_pir_api || FLAGS_enable_pir_in_executor) { // NOLINT ir_program = plan_.IrProgram(job_type); } else { // NOTE (liuchenghao): std::make_shared will duplicate ProgramDesc object, diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 99ccbbe50d241..55fc19ad2be1c 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1754,7 +1754,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, std::string phi_kernel_name; if 
(phi::KernelFactory::Instance().HasCompatiblePhiKernel(type_)) { if (kernel_signature_ == nullptr || phi_kernel_ == nullptr) { - if (phi::KernelFactory::Instance().HasStructuredKernel(type_)) { + if (phi::KernelFactory::Instance().HasStructuredKernel( + type_)) { // NOLINT kernel_signature_ = std::make_unique(type_.c_str()); } else { @@ -1989,7 +1990,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, 1, platform::EventRole::kInnerOp); if (need_prepare_data_) { - if (fallback_to_cpu) { + if (fallback_to_cpu) { // NOLINT transfer_scope = PrepareData(scope, phi_cpu_kernel_key, &transfered_inplace_vars, @@ -2278,7 +2279,7 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( phi::KernelKey OperatorWithKernel::ChoosePhiKernel( const ExecutionContext& ctx) const { std::string phi_kernel_name; - if (phi::KernelFactory::Instance().HasStructuredKernel(type_)) { + if (phi::KernelFactory::Instance().HasStructuredKernel(type_)) { // NOLINT kernel_signature_ = std::make_unique(type_.c_str()); } else { kernel_signature_ = std::make_unique( @@ -3104,7 +3105,7 @@ static void SetDnnAttrIntoDeviceContext( case proto::AttrType::STRING: one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(std::string, attr)); break; - case proto::AttrType::INTS: + case proto::AttrType::INTS: // NOLINT one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(std::vector, attr)); break; @@ -3358,7 +3359,7 @@ void OperatorWithKernel::BuildPhiKernelContext( case phi::AttributeType::INT_ARRAY: if (attr_iter != Attrs().end()) { switch (AttrTypeID(attr_iter->second)) { - case proto::AttrType::INTS: + case proto::AttrType::INTS: // NOLINT phi_kernel_context->EmplaceBackAttr(std::move(phi::IntArray( PADDLE_GET_CONST(std::vector, attr_iter->second)))); break; @@ -3497,7 +3498,7 @@ void OperatorWithKernel::BuildPhiKernelContext( phi_kernel_context->EmplaceBackAttr( PADDLE_GET_CONST(int64_t, attr_iter->second)); break; - case phi::AttributeType::INT32S: + case phi::AttributeType::INT32S: // NOLINT phi_kernel_context->EmplaceBackAttr( PADDLE_GET_CONST(std::vector, attr_iter->second)); break; @@ -3536,7 +3537,7 @@ void OperatorWithKernel::BuildPhiKernelContext( attr_names[i])); } break; - case phi::AttributeType::FLOAT32S: + case phi::AttributeType::FLOAT32S: // NOLINT phi_kernel_context->EmplaceBackAttr( PADDLE_GET_CONST(std::vector, attr_iter->second)); break; diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 91d24cc70552c..19e09ab5edf8d 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -238,7 +238,7 @@ void SectionWorker::TrainFiles() { #endif } // max_memory_size >= 0 - if (schedule_mode_ == 0) { + if (schedule_mode_ == 0) { // NOLINT RunFThenB(gc); } else { Run1F1B(gc); diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index 50df994014004..c2aab61851fb5 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -185,7 +185,7 @@ AmpOperators::GetMutableUnsupportedOps(const phi::DataType& data_type) { true, phi::errors::InvalidArgument( "The data_type mismatch. 
It should be FLOAT16 or BFLOAT16.")); - if (data_type == phi::DataType::FLOAT16) { + if (data_type == phi::DataType::FLOAT16) { // NOLINT return unsupported_fp16_ops_; } else { return unsupported_bf16_ops_; @@ -375,7 +375,8 @@ template NameVarMap AutoCastInputs(const std::string& op_type, const NameVarMap& ins) { NameVarMap new_ins(ins); - if (AmpOperators::Instance().GetMutableAllowOps()->count(op_type)) { + if (AmpOperators::Instance().GetMutableAllowOps()->count( + op_type)) { // NOLINT for (auto& pair : new_ins) { // NOTE(zhiqiu): batch_norm and layer_norm support only input x is fp16. if ((op_type == "batch_norm" || op_type == "layer_norm" || diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 8f4dfbbcdc977..d9c91a4c6b0a0 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -518,7 +518,7 @@ void VariableWrapperAdd(std::shared_ptr var, static platform::Place GetPlaceOfVar( const std::shared_ptr& var) { platform::Place place; - if (var->Var().IsType()) { + if (var->Var().IsType()) { // NOLINT place = var->Var().Get().place(); } else if (var->Var().IsType()) { place = var->Var().Get().place(); @@ -735,7 +735,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (paddle::platform::is_gpu_place(place)) { + if (paddle::platform::is_gpu_place(place)) { // NOLINT // sum selected rows firstly for (auto& var_info : tmp_grad_vars_) { if (!var_info.var->Var().IsType()) { diff --git a/paddle/fluid/imperative/layout_autotune.cc b/paddle/fluid/imperative/layout_autotune.cc index 006021488aa57..7836572b0c426 100644 --- a/paddle/fluid/imperative/layout_autotune.cc +++ b/paddle/fluid/imperative/layout_autotune.cc @@ -145,7 +145,7 @@ LayoutAutotuneGuard::LayoutAutotuneGuard(std::shared_ptr tracer, } LayoutAutotuneGuard::~LayoutAutotuneGuard() { - if (pre_layout_autotune_) { + if (pre_layout_autotune_) { // NOLINT tracer_->EnableLayoutAutoTune(); } else { tracer_->DisableLayoutAutoTune(); diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index d70d40808f915..3ed9b97bfc362 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -67,7 +67,7 @@ void NCCLParallelContext::Init() { std::vector nccl_ids; nccl_ids.resize(strategy_.nrings_); - if (strategy_.local_rank_ == 0) { + if (strategy_.local_rank_ == 0) { // NOLINT // generate the unique ncclid on the root worker for (auto &nccl_id : nccl_ids) { platform::dynload::ncclGetUniqueId(&nccl_id); diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index 0a5d44a1e1e57..47a3605ecc7be 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -366,7 +366,7 @@ class GradientAccumulationInfo { if (!grad_var_) { grad_var_ = std::make_shared(true, mapped_grad_var_->Name()); grad_var_->SetOverriddenStopGradient(false); - if (sort_gradient_) { + if (sort_gradient_) { // NOLINT accumulator_ = std::make_unique( grad_var_->SharedVar().get()); } else { diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 8129ea244f489..a60c81a4c22d9 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -660,7 +660,7 @@ void PreparedOp::Run(const NameVarMap& ins, const 
NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - if (run_phi_kernel_) { + if (run_phi_kernel_) { // NOLINT PreparedOpRunPtImpl(op_, kernel_key_, arg_map_fn_, @@ -692,7 +692,7 @@ void PreparedOp::Run(const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - if (run_phi_kernel_) { + if (run_phi_kernel_) { // NOLINT PreparedOpRunPtImpl(op_, kernel_key_, arg_map_fn_, @@ -724,7 +724,7 @@ void PreparedOp::Run(const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - if (run_phi_kernel_) { + if (run_phi_kernel_) { // NOLINT PreparedOpRunPtImpl(op_, kernel_key_, arg_map_fn_, diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 461c2d3ff4bb8..5b8dc28d03111 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -227,7 +227,7 @@ void SplitTensorsWithType( void Group::ConcatTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); - if (platform::is_gpu_place(place)) { + if (platform::is_gpu_place(place)) { // NOLINT #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ConcatTensorsWithType(static_cast(context), dense_tensors_, @@ -263,7 +263,7 @@ void Group::ConcatTensors(const platform::DeviceContext &context) { void Group::SplitTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); - if (platform::is_gpu_place(place)) { + if (platform::is_gpu_place(place)) { // NOLINT #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) SplitTensorsWithType(static_cast(context), &dense_contents_, diff --git a/paddle/fluid/imperative/var_helper.cc b/paddle/fluid/imperative/var_helper.cc index bafea5a720d3a..9561962935ffe 100644 --- a/paddle/fluid/imperative/var_helper.cc +++ b/paddle/fluid/imperative/var_helper.cc @@ -50,7 +50,8 @@ void InitializeVariable(paddle::framework::Variable *var, var->GetMutable(); } else if (var_type == paddle::framework::proto::VarType::FEED_MINIBATCH) { var->GetMutable(); - } else if (var_type == paddle::framework::proto::VarType::FETCH_LIST) { + } else if (var_type == + paddle::framework::proto::VarType::FETCH_LIST) { // NOLINT var->GetMutable(); } else if (var_type == paddle::framework::proto::VarType::STEP_SCOPES) { var->GetMutable>(); diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index f8a4d4d15af72..dcdf8405cc2f8 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -271,7 +271,7 @@ void LiteSubgraphPass::SetUpEngine( Get>("nnadapter_model_cache_token"); lite_api::TargetType target_type = TARGET(kX86); - if (use_gpu) { + if (use_gpu) { // NOLINT target_type = TARGET(kCUDA); } else if (use_xpu) { target_type = TARGET(kXPU); @@ -417,13 +417,11 @@ void LiteSubgraphPass::ApplyImpl(framework::ir::Graph* graph) const { auto& lite_ops_filter = Get>("lite_ops_filter"); auto teller = [&lite_ops_filter](const Node* node) { - if (!node->IsOp() || !node->Op()) - return false; - else if (node->Op()->Type() == "feed" || node->Op()->Type() == "fetch") - return false; - else if (std::find(lite_ops_filter.begin(), - lite_ops_filter.end(), - node->Op()->Type()) != lite_ops_filter.end()) + if (!node->IsOp() || !node->Op() || node->Op()->Type() == "feed" || + 
node->Op()->Type() == "fetch" || + std::find(lite_ops_filter.begin(), + lite_ops_filter.end(), + node->Op()->Type()) != lite_ops_filter.end()) return false; return inference::lite::OpTeller::Global().Tell(node->Op()->Type(), *node->Op()); diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc index 8106dfbb9e6aa..ea97be8f90a60 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -121,7 +121,7 @@ std::unique_ptr IrGraphBuildPass::LoadModel( bool model_from_memory, bool skip_load_params) { framework::Executor exe(place); - if (!model_from_memory) { + if (!model_from_memory) { // NOLINT return Load(&exe, scope, program_path, params_path, !skip_load_params); } else { return LoadFromMemory(&exe, scope, program_path, params_path); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 35ff7eb608b6a..9b05b9f78572e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1301,7 +1301,7 @@ bool AnalysisPredictor::LoadConverterConfig( int64_t key = std::stoll(one_line[0]); for (size_t i = 1; i < one_line.size(); ++i) { int64_t val = std::stoll(one_line[i]); - if (ring_to_rank) { + if (ring_to_rank) { // NOLINT if (ring_id_to_ranks->find(key) == ring_id_to_ranks->end()) { ring_id_to_ranks->insert({key, std::vector()}); } @@ -1441,7 +1441,7 @@ bool AnalysisPredictor::Run(const std::vector &inputs, HookCollectShapeRangeInfo(); } - if (config_.new_executor_enabled()) { + if (config_.new_executor_enabled()) { // NOLINT executor_->RunInterpreterCore(); } else { // Run the inference program @@ -1514,7 +1514,7 @@ bool AnalysisPredictor::Run(const std::vector &inputs, HookCollectShapeRangeInfo(); } - if (config_.new_executor_enabled()) { + if (config_.new_executor_enabled()) { // NOLINT executor_->RunInterpreterCore(); } else { // Run the inference program @@ -1937,7 +1937,7 @@ void AnalysisPredictor::PrepareArgument() { if (deleted_passes.count(pass)) continue; pass_builder->AppendPass(pass); } - } else if (config_.use_xpu()) { + } else if (config_.use_xpu()) { // NOLINT // All passes support fp16. Not reset pass_builder. } else if (config_.use_custom_device()) { // All passes support fp16. Not reset pass_builder. 
@@ -2060,7 +2060,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() { #else if (config_.mkldnn_enabled() || (config_.tensorrt_engine_enabled() && - config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8)) { + config_.tensorrt_precision_mode_ == + AnalysisConfig::Precision::kInt8)) { // NOLINT argument_->PartiallyRelease(); } else { argument_.reset(nullptr); @@ -2354,7 +2355,7 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( const std::string &name) { framework::Scope *scope = nullptr; #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) - if (config_.dist_config().use_dist_model()) { + if (config_.dist_config().use_dist_model()) { // NOLINT scope = scope_.get(); } else { scope = executor_->GetScope(); @@ -2405,7 +2406,7 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( const std::string &name) { framework::Scope *scope; // NOLINT #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) - if (config_.dist_config().use_dist_model()) { + if (config_.dist_config().use_dist_model()) { // NOLINT scope = scope_.get(); } else { scope = executor_->GetScope(); @@ -2455,7 +2456,7 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( bool AnalysisPredictor::ZeroCopyRun(bool switch_stream) { inference::DisplayMemoryInfo(place_, "before run"); #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) - if (config_.dist_config().use_dist_model()) { + if (config_.dist_config().use_dist_model()) { // NOLINT VLOG(3) << "ZeroCopyRun will use the fleet executor."; fleet_exe_->Run(config_.dist_config().carrier_id()); return true; @@ -2514,7 +2515,7 @@ bool AnalysisPredictor::ZeroCopyRun(bool switch_stream) { } #endif - if (config_.new_executor_enabled()) { + if (config_.new_executor_enabled()) { // NOLINT executor_->RunInterpreterCore({}, false, switch_stream); } else { executor_->Run(); @@ -2780,7 +2781,7 @@ void AnalysisPredictor::StatisticShapeRangeInfo() { bool AnalysisPredictor::LoadProgramDesc() { // Initialize the inference program std::string filename; - if (!config_.model_dir().empty()) { + if (!config_.model_dir().empty()) { // NOLINT filename = config_.model_dir() + "/__model__"; } else if (!config_.prog_file().empty()) { // All parameters are saved in a single file. diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index c8eaa1c3ebd1e..1ae582feb4acf 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -101,7 +101,7 @@ bool NativePaddlePredictor::Init( executor_ = std::make_unique(place_); // Initialize the inference program - if (!config_.model_dir.empty()) { + if (!config_.model_dir.empty()) { // NOLINT // Parameters are saved in separate files sited in // the specified `dirname`. 
inference_program_ = paddle::inference::Load( @@ -286,7 +286,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, } input.set_lod(lod); int idx = -1; - if (config_.specify_input_name) { + if (config_.specify_input_name) { // NOLINT idx = static_cast(feed_names_[inputs[i].name]); } else { idx = PADDLE_GET_CONST(int, feeds_[i]->GetAttr("col")); diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index 46ae4624ea9e8..76222b84d4624 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -78,7 +78,7 @@ void AnalysisPredictor::MkldnnQuantizer::CalculateScalesForRNNWeights( check_var(wh_var, wh_name); phi::DenseTensor* wx_tensor = wx_var->GetMutable(); phi::DenseTensor* wh_tensor = wh_var->GetMutable(); - if (gru) { + if (gru) { // NOLINT scales_[wx_name] = GetMaxChGRUScalingFactor(*wx_tensor, *wh_tensor); } else { scales_[wx_name] = GetMaxChLSTMScalingFactor(*wx_tensor, *wh_tensor); @@ -215,6 +215,7 @@ void AnalysisPredictor::MkldnnQuantizer::CalculateSingleScale( switch (rule) { case ScaleAlgo::MAX: + case ScaleAlgo::KL: scales_[var_name] = GetMaxScalingFactor(var_tensor, is_unsigned); break; case ScaleAlgo::MAX_CH: @@ -227,9 +228,6 @@ void AnalysisPredictor::MkldnnQuantizer::CalculateSingleScale( is_unsigned, /*is_transposed*/ true); break; - case ScaleAlgo::KL: - scales_[var_name] = GetKLScalingFactor(var_tensor, is_unsigned); - break; default: throw std::runtime_error( "MkldnnQuantizer: Unexpected ScaleAlgo specified."); diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc index b7081609f2f90..bf5acda9c1bbd 100644 --- a/paddle/fluid/ir_adaptor/translator/op_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc @@ -1903,7 +1903,7 @@ struct FillConstant2FullTranscriber : public OpTranscriber { } } switch (place_type) { - case -1: + case -1: // NOLINT attribute_map["place"] = paddle::dialect::PlaceAttribute::get( ctx, phi::Place(phi::AllocationType::UNDEFINED)); break; diff --git a/paddle/fluid/jit/property.cc b/paddle/fluid/jit/property.cc index 687468df83a3d..37c426bb5401b 100644 --- a/paddle/fluid/jit/property.cc +++ b/paddle/fluid/jit/property.cc @@ -99,7 +99,7 @@ std::unordered_map> Property::Values() { case ValueProto::STRING: *var->GetMutable() = GetString(n); break; - case ValueProto::FLOATS: + case ValueProto::FLOATS: // NOLINT *var->GetMutable>() = GetFloats(n); break; case ValueProto::INTS: diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index b73ffe4319be7..cc5034c86f90f 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -380,7 +380,7 @@ void BufferedReader::ReadNextImpl(paddle::framework::LoDTensorArray *out) { return; } - if (platform::is_gpu_place(place_)) { + if (platform::is_gpu_place(place_)) { // NOLINT *out = std::move(cuda_buffer_[i]); } else if (platform::is_xpu_place(place_)) { *out = std::move(xpu_buffer_[i]); diff --git a/paddle/fluid/pir/drr/src/ir_operation_factory.cc b/paddle/fluid/pir/drr/src/ir_operation_factory.cc index f792ccbdaff92..61c12c281e139 100644 --- a/paddle/fluid/pir/drr/src/ir_operation_factory.cc +++ b/paddle/fluid/pir/drr/src/ir_operation_factory.cc @@ -81,7 +81,7 @@ pir::Attribute CreateIrAttribute(const std::any& obj) { std::any_cast(obj)); } else if (obj.type() == typeid(phi::Place)) { return 
IrAttrbuteCreator()(std::any_cast(obj)); - } else if (obj.type() == typeid(std::vector)) { + } else if (obj.type() == typeid(std::vector)) { // NOLINT return IrAttrbuteCreator>()( std::any_cast>(obj)); } else if (obj.type() == typeid(std::vector)) { diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc index 118ba7d6b782c..df66cc63e3986 100644 --- a/paddle/fluid/platform/place.cc +++ b/paddle/fluid/platform/place.cc @@ -62,8 +62,6 @@ bool is_same_place(const Place &p1, const Place &p2) { if (places_are_same_class(p1, p2)) { if (is_cpu_place(p1) || is_cuda_pinned_place(p1)) { return true; - } else if (is_xpu_place(p1) || is_ipu_place(p1) || is_custom_place(p1)) { - return p1 == p2; } else { return p1 == p2; } diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 816ae57ff4c06..2630b36d0e8ad 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -200,8 +200,8 @@ RecordMemEvent::RecordMemEvent(const void *ptr, RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3]; RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] = true; } else { - current_allocated = - DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + current_allocated = DEVICE_MEMORY_STAT_CURRENT_VALUE( + Allocated, place.GetDeviceId()); // NOLINT peak_allocated = DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0] = @@ -283,10 +283,10 @@ RecordMemEvent::RecordMemEvent(const void *ptr, RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3]; RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] = true; } else { - current_reserved = - DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); - peak_reserved = - DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + current_reserved = DEVICE_MEMORY_STAT_CURRENT_VALUE( + Reserved, place.GetDeviceId()); // NOLINT + peak_reserved = DEVICE_MEMORY_STAT_PEAK_VALUE( + Reserved, place.GetDeviceId()); // NOLINT RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1] = current_reserved; RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3] = @@ -366,10 +366,10 @@ RecordMemEvent::RecordMemEvent(const void *ptr, RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3]; RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] = true; } else { - current_allocated = - DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); - peak_allocated = - DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + current_allocated = DEVICE_MEMORY_STAT_CURRENT_VALUE( + Allocated, place.GetDeviceId()); // NOLINT + peak_allocated = DEVICE_MEMORY_STAT_PEAK_VALUE( + Allocated, place.GetDeviceId()); // NOLINT RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0] = current_allocated; RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2] = @@ -449,10 +449,10 @@ RecordMemEvent::RecordMemEvent(const void *ptr, RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3]; RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] = true; } else { - current_reserved = - DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); - peak_reserved = - DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + current_reserved = DEVICE_MEMORY_STAT_CURRENT_VALUE( + Reserved, place.GetDeviceId()); // NOLINT + peak_reserved = DEVICE_MEMORY_STAT_PEAK_VALUE( + Reserved, place.GetDeviceId()); // NOLINT RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1] = current_reserved; 
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3] = diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 3cb3ccf964ec8..00b6ba994233f 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -442,7 +442,7 @@ Placements ParsePlacementsArgs( Placements placements; const std::string& placements_key = "placements"; - if (kw_order_map[placements_key] <= args_num) { + if (kw_order_map[placements_key] <= args_num) { // NOLINT placements = CastPyArg2VectorOfPlacement( PyTuple_GET_ITEM(args, kw_order_map[placements_key] - 1), kw_order_map[placements_key] - 1); diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 0a72208f36ccc..812be85b653af 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -644,7 +644,7 @@ PyObject* eager_api_run_custom_op(PyObject* self, } else if (attr_type_str == "std::string") { ctx.EmplaceBackAttr( CastPyArg2AttrString(obj, attr_start_idx + i)); // NOLINT - } else if (attr_type_str == "std::vector") { + } else if (attr_type_str == "std::vector") { // NOLINT ctx.EmplaceBackAttr(CastPyArg2VectorOfInt(obj, attr_start_idx + i)); } else if (attr_type_str == "std::vector") { ctx.EmplaceBackAttr(CastPyArg2VectorOfFloat(obj, attr_start_idx + i)); diff --git a/paddle/fluid/pybind/eager_math_op_patch.cc b/paddle/fluid/pybind/eager_math_op_patch.cc index 21fd549cb0b2d..17b36e9237e78 100644 --- a/paddle/fluid/pybind/eager_math_op_patch.cc +++ b/paddle/fluid/pybind/eager_math_op_patch.cc @@ -818,10 +818,10 @@ static PyObject* tensor__rdiv__method(TensorObject* self, bool has_other_double = false; if (PyFloat_Check(other_obj) || PyCheckInteger(other_obj) || IsNumpyType(other_obj)) { - if (PyFloat_Check(other_obj)) { + if (PyFloat_Check(other_obj)) { // NOLINT other_double = CastPyArg2Double(other_obj, "__rdiv__", 0); has_other_double = true; - } else if (PyCheckInteger(other_obj) || IsNumpyType(other_obj)) { + } else if (PyCheckInteger(other_obj) || IsNumpyType(other_obj)) { // NOLINT other_double = CastPyArg2Double(other_obj, "__rdiv__", 0); has_other_double = true; } diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index c6a2db061594b..851e498bac8b3 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -647,7 +647,7 @@ std::vector> CastPyArg2VectorOfVectorOfSize_t( platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos) { platform::Place place; - if (PyObject_TypeCheck(obj, g_place_pytype)) { + if (PyObject_TypeCheck(obj, g_place_pytype)) { // NOLINT place = ::pybind11::handle(obj).cast(); } else if (PyObject_TypeCheck(obj, g_cudaplace_pytype)) { place = ::pybind11::handle(obj).cast(); @@ -761,7 +761,8 @@ std::vector CastPyArg2VectorOfTensorBase(PyObject* obj, i)); } } - } else if (PyObject_TypeCheck(obj, g_framework_lodtensorarray_pytype)) { + } else if (PyObject_TypeCheck(obj, + g_framework_lodtensorarray_pytype)) { // NOLINT for (auto& tensor : (::pybind11::handle(obj).cast())) { result.emplace_back(tensor); @@ -788,7 +789,7 @@ using phi::distributed::Shard; Placements CastPyArg2VectorOfPlacement(PyObject* obj, ssize_t arg_pos) { Placements result; auto check_and_emplace = [&](PyObject* item, ssize_t i) { - if (PyObject_TypeCheck(item, g_placement_shard_pytype)) { + if (PyObject_TypeCheck(item, g_placement_shard_pytype)) { // NOLINT result.emplace_back( std::make_shared(::pybind11::handle(item).cast())); } else if (PyObject_TypeCheck(item, 
g_placement_replicated_pytype)) { diff --git a/paddle/fluid/pybind/parallel_executor.cc b/paddle/fluid/pybind/parallel_executor.cc index 9060e158c9ed9..1b567fb51ba1e 100644 --- a/paddle/fluid/pybind/parallel_executor.cc +++ b/paddle/fluid/pybind/parallel_executor.cc @@ -931,7 +931,7 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT .def_property( "memory_optimize", [](const BuildStrategy &self) -> py::object { - if (self.memory_optimize_) { + if (self.memory_optimize_) { // NOLINT return py::cast(self.memory_optimize_.get()); } else { return py::cast(nullptr); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index ffaef54bb9da9..1d71676ba4314 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1243,7 +1243,7 @@ All parameter, weight, gradient are variables in Paddle. py::return_value_policy::reference) .def("get_bytes", [](Variable &self) { - if (self.IsType()) { + if (self.IsType()) { // NOLINT return py::bytes(*(self.GetMutable())); } else { return py::bytes( @@ -2232,7 +2232,7 @@ All parameter, weight, gradient are variables in Paddle. const std::string &var_name, size_t index) -> py::object { auto &var = framework::GetFetchVariable(scope, var_name, index); - if (data_is_lod_tensor(var)) { + if (data_is_lod_tensor(var)) { // NOLINT return py::cast(PADDLE_GET(phi::DenseTensor, var)); } else { return py::cast(PADDLE_GET(LoDTensorArray, var)); diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index d4c5de0dbe6dc..37053cc0c09ec 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -63,6 +63,7 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { return phi::Place(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) case phi::Backend::GPU: + case phi::Backend::GPUDNN: return phi::GPUPlace( set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); #endif @@ -70,11 +71,6 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { case phi::Backend::ONEDNN: // NOLINT return phi::CPUPlace(); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - case phi::Backend::GPUDNN: - return phi::GPUPlace( - set_device_id ? 
phi::backends::gpu::GetCurrentDeviceId() : 0); -#endif #if defined(PADDLE_WITH_XPU) case phi::Backend::XPU: return phi::XPUPlace( diff --git a/paddle/phi/core/kernel_registry.cc b/paddle/phi/core/kernel_registry.cc index fa9d531b6534d..6ce1af187e9a3 100644 --- a/paddle/phi/core/kernel_registry.cc +++ b/paddle/phi/core/kernel_registry.cc @@ -47,139 +47,159 @@ void SetKernelArgsDef(const std::vector& args_type, ) { #endif // do nothing, skip context arg now - } else if (arg_type == std::type_index(typeid(const DenseTensor&))) { + } else if (arg_type == + std::type_index(typeid(const DenseTensor&))) { // NOLINT args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); } else if (arg_type == - std::type_index(typeid(const paddle::optional&))) { + std::type_index( + typeid(const paddle::optional&))) { // NOLINT args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); } else if (arg_type == - std::type_index(typeid( - const paddle::optional>&))) { + std::type_index( + typeid(const paddle::optional< + std::vector>&))) { // NOLINT args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); } else if (arg_type == - std::type_index(typeid(const paddle::optional&))) { + std::type_index( + typeid(const paddle::optional&))) { // NOLINT args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); - } else if (arg_type == std::type_index(typeid( - const std::vector&))) { + } else if (arg_type == + std::type_index( + typeid(const std::vector&))) { // NOLINT args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); } else if (arg_type == - std::type_index(typeid(const phi::ExtendedTensor&))) { + std::type_index(typeid(const phi::ExtendedTensor&))) { // NOLINT args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); - } else if (arg_type == std::type_index(typeid( - const std::vector&))) { + } else if (arg_type == + std::type_index(typeid( + const std::vector&))) { // NOLINT args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); - } else if (arg_type == std::type_index(typeid( - const std::vector&))) { + } else if (arg_type == + std::type_index(typeid( + const std::vector&))) { // NOLINT args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); } else if (arg_type == - std::type_index(typeid(const std::vector&))) { + std::type_index( + typeid(const std::vector&))) { // NOLINT args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); - } else if (arg_type == std::type_index(typeid( - const std::vector&))) { + } else if (arg_type == + std::type_index( + typeid(const std::vector&))) { // NOLINT args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); - } else if (arg_type == std::type_index(typeid(const SelectedRows&))) { + } else if (arg_type == + std::type_index(typeid(const SelectedRows&))) { // NOLINT args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); - } else if (arg_type == std::type_index(typeid(const StringTensor&))) { + } else if (arg_type == + std::type_index(typeid(const StringTensor&))) { // NOLINT args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); - } else if (arg_type == 
std::type_index(typeid(const SparseCooTensor&))) { + } else if (arg_type == + std::type_index(typeid(const SparseCooTensor&))) { // NOLINT args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); - } else if (arg_type == std::type_index(typeid( - paddle::optional))) { + } else if (arg_type == + std::type_index(typeid( + paddle::optional))) { // NOLINT args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); - } else if (arg_type == std::type_index(typeid(const SparseCsrTensor&))) { + } else if (arg_type == + std::type_index(typeid(const SparseCsrTensor&))) { // NOLINT args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); - } else if (arg_type == std::type_index(typeid( - paddle::optional))) { + } else if (arg_type == + std::type_index(typeid( + paddle::optional))) { // NOLINT args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); - } else if (arg_type == std::type_index(typeid(const TensorArray&))) { + } else if (arg_type == + std::type_index(typeid(const TensorArray&))) { // NOLINT args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); - } else if (arg_type == std::type_index(typeid(DenseTensor*))) { + } else if (arg_type == std::type_index(typeid(DenseTensor*))) { // NOLINT args_def->AppendOutput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); - } else if (arg_type == std::type_index(typeid(std::vector))) { + } else if (arg_type == + std::type_index(typeid(std::vector))) { // NOLINT args_def->AppendOutput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); - } else if (arg_type == std::type_index(typeid(SelectedRows*))) { + } else if (arg_type == std::type_index(typeid(SelectedRows*))) { // NOLINT args_def->AppendOutput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); - } else if (arg_type == std::type_index(typeid(TensorArray*))) { + } else if (arg_type == std::type_index(typeid(TensorArray*))) { // NOLINT args_def->AppendOutput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); - } else if (arg_type == std::type_index(typeid(SparseCooTensor*))) { + } else if (arg_type == + std::type_index(typeid(SparseCooTensor*))) { // NOLINT args_def->AppendOutput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); - } else if (arg_type == std::type_index(typeid(SparseCsrTensor*))) { + } else if (arg_type == + std::type_index(typeid(SparseCsrTensor*))) { // NOLINT args_def->AppendOutput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); - } else if (arg_type == std::type_index(typeid(StringTensor*))) { + } else if (arg_type == std::type_index(typeid(StringTensor*))) { // NOLINT args_def->AppendOutput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); - } else if (arg_type == std::type_index(typeid(ExtendedTensor*))) { + } else if (arg_type == + std::type_index(typeid(ExtendedTensor*))) { // NOLINT args_def->AppendOutput(default_key.backend(), default_tensor_layout, default_key.dtype(), diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 5648ff0d469a3..b064a9f73bad6 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -236,7 +236,7 @@ void ArgMinMaxInferMeta(const MetaTensor& x, if (!config.is_runtime && axis.FromTensor()) { std::vector 
vec; if (flatten) { - if (keepdims) { + if (keepdims) { // NOLINT vec = std::vector(x.dims().size(), -1); } else { vec = {}; @@ -307,7 +307,7 @@ void ArgMinMaxInferMeta(const MetaTensor& x, std::vector vec; if (flatten) { - if (keepdims) { + if (keepdims) { // NOLINT vec = std::vector(x.dims().size(), 1); } else { vec = {}; @@ -4034,7 +4034,8 @@ void SplitInferMeta(const MetaTensor& x, if ((sections.FromTensor() && !config.is_runtime) || axis_value == -1 || (axis_value >= 0 && x.dims().at(axis_value) <= 0)) { std::vector out_dims; - if ((sections.FromTensor() && !config.is_runtime) || axis_value == -1) { + if ((sections.FromTensor() && !config.is_runtime) || + axis_value == -1) { // NOLINT out_dims = std::vector( sections_data.size(), common::make_ddim(std::vector(x.dims().size(), -1))); @@ -4126,7 +4127,7 @@ void SplitWithNumInferMeta(const MetaTensor& x, // fill out dims with -1 if (axis_value == -1 || (axis_value >= 0 && x.dims().at(axis_value) <= 0)) { std::vector out_dims; - if (axis_value == -1) { + if (axis_value == -1) { // NOLINT out_dims = std::vector( num, common::make_ddim(std::vector(x.dims().size(), -1))); } else { @@ -5415,7 +5416,7 @@ void WeightQuantizeInferMeta(const MetaTensor& x, } std::vector dim_out; - if (algo == "weight_only_int8" || algo == "llm.int8") { + if (algo == "weight_only_int8" || algo == "llm.int8") { // NOLINT dim_out = std::vector({x_dims[1], x_dims[0]}); } else if (algo == "weight_only_int4") { dim_out = std::vector({x_dims[1] / 2, x_dims[0]}); diff --git a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc index 1bdf25dd4eb82..e9c5ae6a39e4a 100644 --- a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc @@ -611,7 +611,7 @@ void BatchNormDoubleGradKernel( EigenArrayMap ddy_arr( ctx.template Alloc(&transformed_ddy), C, sample_size); ddy_arr.setZero(); - if (use_global_stats) { + if (use_global_stats) { // NOLINT // math: ddy = r * ddx * inv_var + ddbias + // ddscale * (x - mean) * inv_var if (ddX) { diff --git a/paddle/phi/kernels/cpu/batch_norm_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_kernel.cc index 39d53fec10a9f..f6d5e97dc7245 100644 --- a/paddle/phi/kernels/cpu/batch_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/batch_norm_kernel.cc @@ -159,7 +159,7 @@ void BatchNormKernel(const Context& ctx, // use SavedMean and SavedVariance to do normalize Eigen::Array inv_std(C); - if (global_stats) { + if (global_stats) { // NOLINT ConstEigenVectorArrayMap var_arr(variance.data(), C); inv_std = (var_arr + epsilon).sqrt().inverse(); } else { @@ -178,7 +178,7 @@ void BatchNormKernel(const Context& ctx, auto* Bias = bias.get_ptr(); Eigen::Array new_scale(C); Eigen::Array new_bias(C); - if (Scale && Bias) { + if (Scale && Bias) { // NOLINT ConstEigenVectorArrayMap scale_arr(Scale->data(), C); ConstEigenVectorArrayMap bias_arr(Bias->data(), C); new_scale = inv_std * scale_arr; diff --git a/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc b/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc index b7fdefe023e73..ed80148344e1f 100644 --- a/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc @@ -35,7 +35,7 @@ void DivideKernel(const Context& dev_ctx, } else { auto x_dims = x.dims(); auto y_dims = y.dims(); - if (x_dims.size() >= y_dims.size()) { + if (x_dims.size() >= y_dims.size()) { // NOLINT funcs::ElementwiseCompute, T>( dev_ctx, x, y, funcs::DivideFunctor(), out, -1); } else { diff --git 
a/paddle/phi/kernels/cpu/rnn_grad_kernel.cc b/paddle/phi/kernels/cpu/rnn_grad_kernel.cc index a48d05b8d783e..8b26bf31de9bb 100644 --- a/paddle/phi/kernels/cpu/rnn_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/rnn_grad_kernel.cc @@ -1311,7 +1311,7 @@ void RnnGradKernel(const Context& dev_ctx, pre_state_grad, weight_grad_list); // run gru - } else if (is_rnn_relu(mode)) { + } else if (is_rnn_relu(mode)) { // NOLINT gate_num = 1; RnnGradFunc, SingleGradLayer, diff --git a/paddle/phi/kernels/cpu/rnn_kernel.cc b/paddle/phi/kernels/cpu/rnn_kernel.cc index a0035c6db4a75..5b594089793c8 100644 --- a/paddle/phi/kernels/cpu/rnn_kernel.cc +++ b/paddle/phi/kernels/cpu/rnn_kernel.cc @@ -868,7 +868,7 @@ void RnnKernel(const Context& dev_ctx, is_test, seed, reserve); - } else if (is_rnn_relu(mode)) { + } else if (is_rnn_relu(mode)) { // NOLINT gate_num = 1; RnnFunc { int64_t h = static_cast(lod[i + 1] - lod[i]); auto in_e = EigenMatrix::From(in_t, common::make_ddim({h, w})); auto out_e = EigenVector::Flatten(out_t); - if (pooltype == "AVERAGE") { + if (pooltype == "AVERAGE") { // NOLINT out_e.device(place) = in_e.mean(Eigen::array({{0}})); } else if (pooltype == "SQRT") { out_e.device(place) = in_e.sum(Eigen::array({{0}})) / diff --git a/paddle/phi/kernels/legacy/cpu/elementwise_kernel.cc b/paddle/phi/kernels/legacy/cpu/elementwise_kernel.cc index dafbf2889277d..84ebbf04fee11 100644 --- a/paddle/phi/kernels/legacy/cpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/legacy/cpu/elementwise_kernel.cc @@ -55,7 +55,7 @@ void RemainderRawKernel(const Context& dev_ctx, dev_ctx.template Alloc(out); auto x_dims = x.dims(); auto y_dims = y.dims(); - if (x_dims.size() >= y_dims.size()) { + if (x_dims.size() >= y_dims.size()) { // NOLINT funcs::ElementwiseCompute, T>( dev_ctx, x, y, funcs::RemainderFunctor(), out, axis); } else { @@ -74,7 +74,7 @@ void FloorDivideRawKernel(const Context& dev_ctx, dev_ctx.template Alloc(out); auto x_dims = x.dims(); auto y_dims = y.dims(); - if (x_dims.size() >= y_dims.size()) { + if (x_dims.size() >= y_dims.size()) { // NOLINT funcs::ElementwiseCompute, T>( dev_ctx, x, y, funcs::FloorDivideFunctor(), out, axis); } else { diff --git a/test/cpp/fluid/framework/details/fused_broadcast_op_handle_test.cc b/test/cpp/fluid/framework/details/fused_broadcast_op_handle_test.cc index 786b857a80dcc..aee187d77f484 100644 --- a/test/cpp/fluid/framework/details/fused_broadcast_op_handle_test.cc +++ b/test/cpp/fluid/framework/details/fused_broadcast_op_handle_test.cc @@ -56,7 +56,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { // create op handle node nodes_.emplace_back( ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation)); - if (use_device_ == p::kCUDA) { + if (use_device_ == p::kCUDA) { // NOLINT #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) op_handle_ = new FusedBroadcastOpHandle( nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); diff --git a/test/cpp/imperative/test_gradient_accmulator.cc b/test/cpp/imperative/test_gradient_accmulator.cc index b7b571fa196ad..12e2325873c47 100644 --- a/test/cpp/imperative/test_gradient_accmulator.cc +++ b/test/cpp/imperative/test_gradient_accmulator.cc @@ -376,7 +376,7 @@ static framework::Variable RandomSelectedRows(framework::DDim dims, static std::unique_ptr CreateAccumulator( const std::shared_ptr& var, bool sort_gradient) { - if (sort_gradient) { + if (sort_gradient) { // NOLINT return std::unique_ptr( new SortedGradientAccumulator(var.get())); } else { @@ -400,7 +400,7 @@ static void 
TestGradientAccumulatorTestUnchangeInput(
   std::mt19937 engine(seed);
 
   auto create_var = [&](bool use_tensor) {
-    if (use_tensor) {
+    if (use_tensor) {  // NOLINT
       return RandomTensor(dim, place);
     } else {
       return RandomSelectedRows(dim, place, dist(engine));

From 8d1d18f09906f82aebfae2eb1bf404d36633ecd5 Mon Sep 17 00:00:00 2001
From: zyfncg
Date: Fri, 1 Mar 2024 11:02:46 +0800
Subject: [PATCH 46/55] [CINN] Add test for llama inference (#62153)

* fix cmake patch command to avoid patching twice error

* add test for llama inference

* fix bug of test

* fix bug

* revert other commit

* add llama forward test

* polish log

* remove shape pass flag

---------

Co-authored-by: Silver Ling
---
 test/ir/pir/cinn/CMakeLists.txt               |   1 +
 test/ir/pir/cinn/inference/CMakeLists.txt     |  23 +
 .../pir/cinn/inference/test_llama_forward.py  | 687 ++++++++++++++++++
 .../cinn/inference/test_llama_postprocess.py  | 123 ++++
 4 files changed, 834 insertions(+)
 create mode 100644 test/ir/pir/cinn/inference/CMakeLists.txt
 create mode 100644 test/ir/pir/cinn/inference/test_llama_forward.py
 create mode 100644 test/ir/pir/cinn/inference/test_llama_postprocess.py

diff --git a/test/ir/pir/cinn/CMakeLists.txt b/test/ir/pir/cinn/CMakeLists.txt
index 3daedfb5b4f6e..7a7d98dc37ba3 100644
--- a/test/ir/pir/cinn/CMakeLists.txt
+++ b/test/ir/pir/cinn/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_subdirectory(adt)
 add_subdirectory(symbolic)
+add_subdirectory(inference)
 add_subdirectory(sub_graphs)
 
 if(WITH_GPU)
diff --git a/test/ir/pir/cinn/inference/CMakeLists.txt b/test/ir/pir/cinn/inference/CMakeLists.txt
new file mode 100644
index 0000000000000..c5ff7c9573d5e
--- /dev/null
+++ b/test/ir/pir/cinn/inference/CMakeLists.txt
@@ -0,0 +1,23 @@
+if(WITH_GPU)
+  file(
+    GLOB CINN_PIR_INFER_TEST
+    RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+    "test_*.py")
+
+  foreach(cinn_pir_test_name ${CINN_PIR_INFER_TEST})
+    string(REGEX REPLACE ".py" "" cinn_pir_test_name ${cinn_pir_test_name})
+    add_test(
+      NAME ${cinn_pir_test_name}
+      COMMAND
+        ${CMAKE_COMMAND} -E env
+        PYTHONPATH=${CMAKE_BINARY_DIR}:${CMAKE_BINARY_DIR}/python/:$ENV{PYTHONPATH}
+        FLAGS_prim_enable_dynamic=True FLAGS_prim_all=True
+        FLAGS_enable_pir_api=1 FLAGS_cinn_bucket_compile=True
+        ${PYTHON_EXECUTABLE}
+        ${CMAKE_CURRENT_SOURCE_DIR}/${cinn_pir_test_name}.py
+      WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+    set_tests_properties(${cinn_pir_test_name} PROPERTIES LABELS
+                                                          "RUN_TYPE=CINN")
+  endforeach()
+
+endif()
diff --git a/test/ir/pir/cinn/inference/test_llama_forward.py b/test/ir/pir/cinn/inference/test_llama_forward.py
new file mode 100644
index 0000000000000..7c456ce3921d4
--- /dev/null
+++ b/test/ir/pir/cinn/inference/test_llama_forward.py
@@ -0,0 +1,687 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
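+
+# What follows is a deliberately trimmed-down, single-decoder-layer LLaMA
+# model (config, rotary embedding, attention, MLP, RMSNorm), plus a test
+# that converts its forward pass to a static graph, optionally compiles it
+# with CINN, and checks the compiled output against the dynamic-graph result.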
+
+import math
+import sys
+import unittest
+from os.path import dirname
+from typing import Optional, Tuple
+
+import numpy as np
+
+import paddle
+import paddle.nn.functional as F
+from paddle import nn
+from paddle.incubate.nn.functional import swiglu
+from paddle.static import InputSpec
+
+sys.path.append(dirname(dirname(__file__)))
+
+import utils
+
+
+class LlamaConfig:
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=11008,
+        max_position_embeddings=2048,
+        seq_length=2048,
+        num_hidden_layers=1,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.max_position_embeddings = max_position_embeddings
+        self.seq_length = seq_length
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+
+
+class LlamaRotaryEmbedding(nn.Layer):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000):
+        super().__init__()
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        # [dim / 2]
+        self.inv_freq = 1.0 / (
+            self.base
+            ** (
+                paddle.cast(paddle.arange(0, self.dim, 2), dtype="float32")
+                / self.dim
+            )
+        )
+        self._set_cos_sin_cache(seq_len=max_position_embeddings)
+
+    def _set_cos_sin_cache(self, seq_len):
+        self.max_seq_len_cached = seq_len
+        # [seq_len]
+        t = paddle.arange(seq_len, dtype="float32")
+        # [seq_len, dim/2]
+        freqs = paddle.einsum("i,j->ij", t, self.inv_freq)
+        # Different from the paper, but it uses a different permutation in order to obtain the same calculation
+        # [seq_len, dim]
+        emb = paddle.concat([freqs, freqs], axis=-1)
+        # [1, seqlen, 1, dim]
+        self.cos_cached = emb.cos()[None, :, None, :]
+        self.sin_cached = emb.sin()[None, :, None, :]
+
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        cos = self.cos_cached[:, :seq_len, :, :]
+        sin = self.sin_cached[:, :seq_len, :, :]
+        return (
+            cos.cast(x.dtype) if cos.dtype != x.dtype else cos,
+            sin.cast(x.dtype) if sin.dtype != x.dtype else sin,
+        )
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return paddle.concat([-x2, x1], axis=-1)  # shape is the same as x
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
+    if position_ids is None:
+        # Note: Only for LlamaForCausalLMPipe model pretraining
+        cos = cos[:, : q.shape[1], :, :]  # [bs, seq_len, 1, dim]
+        sin = sin[:, : q.shape[1], :, :]  # [bs, seq_len, 1, dim]
+    else:
+        cos = cos.squeeze(axis=[0, 2])  # [seq_len, dim]
+        sin = sin.squeeze(axis=[0, 2])  # [seq_len, dim]
+        cos = cos[position_ids].unsqueeze(2)  # [bs, seq_len, 1, dim]
+        sin = sin[position_ids].unsqueeze(2)  # [bs, seq_len, 1, dim]
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def _make_causal_mask(input_ids_shape, past_key_values_length):
+    """
+    Make causal mask used for self-attention
+    """
+    batch_size, target_length = input_ids_shape  # target_length: seq_len
+
+    mask = paddle.tril(
+        paddle.ones((target_length, target_length), dtype="bool")
+    )
+
+    if past_key_values_length > 0:
+        # [tgt_len, tgt_len + past_len]
+        mask = paddle.concat(
+            [
+                paddle.ones(
+                    [target_length, past_key_values_length], dtype="bool"
+                ),
+                mask,
+            ],
+            axis=-1,
+        )
+
+    # [bs, 1, tgt_len, tgt_len + past_len]
+    return mask[None, None, :, :].expand(
+        [batch_size, 1, target_length, target_length + past_key_values_length]
+    )
+
+
+def _expand_2d_mask(mask, dtype, tgt_length):
+    """
+    Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`.
+    """
+    batch_size, src_length = mask.shape[0], mask.shape[-1]
+    tgt_length = tgt_length if tgt_length is not None else src_length
+
+    mask = mask[:, None, None, :].astype("bool")
+    mask.stop_gradient = True
+    expanded_mask = mask.expand([batch_size, 1, tgt_length, src_length])
+
+    return expanded_mask
+
+
+def get_triangle_upper_mask(x, mask=None):
+    if mask is not None:
+        return mask
+    # [bsz, n_head, q_len, kv_seq_len]
+    shape = x.shape
+    # [bsz, 1, q_len, kv_seq_len]
+    shape[1] = 1
+    mask = paddle.full(shape, paddle.finfo(x.dtype).min, dtype=x.dtype)
+    mask = paddle.triu(mask, diagonal=1)
+    mask.stop_gradient = True
+    return mask
+
+
+# Computes softmax(Q.K^T / sqrt(head_dim) + mask).V, with inputs laid out as
+# [bs, seq_len, num_head, head_dim].
+def scaled_dot_product_attention(
+    query_states,
+    config,
+    key_states,
+    value_states,
+    attention_mask,
+    output_attentions,
+):
+    bsz, q_len, num_heads, head_dim = query_states.shape
+    _, kv_seq_len, _, _ = value_states.shape
+
+    # [bs, seq_len, n_head, head_dim] -> [bs, n_head, seq_len, head_dim]
+    query_states = paddle.transpose(query_states, [0, 2, 1, 3])
+    # merge with the next transpose
+    key_states = paddle.transpose(key_states, [0, 2, 1, 3])
+    value_states = paddle.transpose(value_states, [0, 2, 1, 3])
+
+    # matmul and divide by sqrt(head_dim)
+    attn_weights = paddle.matmul(
+        query_states / math.sqrt(head_dim), key_states.transpose([0, 1, 3, 2])
+    )
+
+    # NOTE: we only call get_triangle_upper_mask under the PP setup.
+    # FIXME(ZHUI): when we use pipeline parallel, the attention_mask can be
+    # None, so we just build the triangular upper mask here.
+    if attention_mask is None:
+        attention_mask = get_triangle_upper_mask(attn_weights)
+    attention_mask = attention_mask.reshape([bsz, 1, q_len, kv_seq_len])
+
+    attn_weights = attn_weights + attention_mask
+    attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(
+        query_states.dtype
+    )
+
+    attn_output = paddle.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose([0, 2, 1, 3])
+
+    attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads])
+    return (attn_output, attn_weights) if output_attentions else attn_output
+
+
+class LlamaMLP(nn.Layer):
+    def __init__(self, config):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+
+        self.gate_proj = nn.Linear(
+            self.hidden_size, self.intermediate_size, bias_attr=False
+        )
+        self.up_proj = nn.Linear(
+            self.hidden_size, self.intermediate_size, bias_attr=False
+        )
+        self.down_proj = nn.Linear(
+            self.intermediate_size, self.hidden_size, bias_attr=False
+        )
+
+    def forward(self, x):
+        x = swiglu(self.gate_proj(x), self.up_proj(x))
+        out = self.down_proj(x)
+        return out
+
+
+class LlamaRMSNorm(nn.Layer):
+    def __init__(self, config):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.weight = paddle.create_parameter(
+            shape=[self.hidden_size],
+            dtype=paddle.get_default_dtype(),
+            default_initializer=nn.initializer.Constant(1.0),
+        )
+        self.variance_epsilon = config.rms_norm_eps
+        self.config = config
+
+    def forward(self, hidden_states):
+        # RMSNorm: x * rsqrt(mean(x^2) + eps), scaled by the learned weight
+        hidden_states = hidden_states.astype("float32")
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = (
+            paddle.rsqrt(variance + self.variance_epsilon) * hidden_states
+        )
+
+        if self.weight.dtype in [paddle.float16, paddle.bfloat16]:
+            hidden_states = paddle.cast(hidden_states, self.weight.dtype)
+        return hidden_states * self.weight
+
+
+class LlamaAttention(nn.Layer):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: LlamaConfig):
+        super().__init__()
+
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+
+        self.head_dim = self.hidden_size // config.num_attention_heads
+
+        self.num_key_value_heads = config.num_key_value_heads
+        assert config.num_attention_heads // config.num_key_value_heads
+        self.num_key_value_groups = (
+            config.num_attention_heads // config.num_key_value_heads
+        )
+        self.gqa_or_mqa = (
+            config.num_attention_heads != config.num_key_value_heads
+        )
+
+        self.max_position_embeddings = config.max_position_embeddings
+        self.seq_length = config.seq_length
+
+        self.q_proj = nn.Linear(
+            self.hidden_size,
+            self.hidden_size,
+            bias_attr=False,
+        )
+        self.k_proj = nn.Linear(
+            self.hidden_size,
+            self.config.num_key_value_heads * self.head_dim,
+            bias_attr=False,
+        )
+        self.v_proj = nn.Linear(
+            self.hidden_size,
+            self.config.num_key_value_heads * self.head_dim,
+            bias_attr=False,
+        )
+
+        self.o_proj = nn.Linear(
+            self.hidden_size,
+            self.hidden_size,
+            bias_attr=False,
+        )
+
+        self._init_rope()
+
+    def _init_rope(self):
+        self.rotary_emb = LlamaRotaryEmbedding(
+            self.head_dim,
+            max_position_embeddings=self.max_position_embeddings,
+        )
+
+    def forward(
+        self,
+        hidden_states,
+        position_ids: Optional[Tuple[paddle.Tensor]] = None,
+        past_key_value: Optional[Tuple[paddle.Tensor]] = None,
+        attention_mask: Optional[paddle.Tensor] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ) -> Tuple[
+        paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]
+    ]:
+        """Input shape: Batch x Time x Channel"""
+        # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism)
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        target_query_shape = [0, 0, self.num_heads, self.head_dim]
+        target_key_value_shape = [0, 0, self.num_key_value_heads, self.head_dim]
+        query_states = query_states.reshape(shape=target_query_shape)
+        key_states = key_states.reshape(shape=target_key_value_shape)
+        value_states = value_states.reshape(shape=target_key_value_shape)
+
+        kv_seq_len = key_states.shape[-3]
+
+        if past_key_value is not None:
+            kv_seq_len += past_key_value[0].shape[-3]
+
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(
+            query_states, key_states, cos, sin, position_ids
+        )
+
+        # [bs, seq_len, num_head, head_dim]
+        if past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = paddle.concat([past_key_value[0], key_states], axis=1)
+            value_states = paddle.concat(
+                [past_key_value[1], value_states], axis=1
+            )
+
+        past_key_value = (key_states, value_states) if use_cache else None
+
+        outputs = scaled_dot_product_attention(
+            query_states,
+            self.config,
+            key_states,
+            value_states,
+            attention_mask,
+            output_attentions,
+        )
+        if output_attentions:
+            attn_output, attn_weights = outputs
+        else:
+            attn_output = outputs
+
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        outputs = (attn_output,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        if use_cache:
+            outputs += (past_key_value,)
+
+        if type(outputs) is tuple and len(outputs) == 1:
+            outputs = outputs[0]
+
+        return outputs
+
+
+class LlamaDecoderLayer(nn.Layer):
+    def __init__(self, config):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = LlamaAttention(config)
+        self.mlp = LlamaMLP(config)
+        self.input_layernorm = LlamaRMSNorm(config)
+        self.post_attention_layernorm = LlamaRMSNorm(config)
+
+    def forward(
+        self,
+        hidden_states: paddle.Tensor,
+        position_ids: Optional[Tuple[paddle.Tensor]] = None,
+        attention_mask: Optional[paddle.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+        past_key_value: Optional[Tuple[paddle.Tensor]] = None,
+        use_cache: Optional[bool] = False,
+    ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]:
+        """
+        Args:
+            hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`paddle.Tensor`, *optional*): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `cache` key value states are returned and can be used to speed up decoding
+                (see `cache`).
+            cache (`Tuple(paddle.Tensor)`, *optional*): cached past key and value projection states
+        """
+
+        # [bs * seq_len, embed_dim] -> [seq_len * bs / n, embed_dim] (sequence_parallel)
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        outputs = self.self_attn(
+            hidden_states,
+            position_ids,
+            past_key_value,
+            attention_mask,
+            output_attentions,
+            use_cache,
+        )
+
+        if type(outputs) is tuple:
+            hidden_states = outputs[0]
+        else:
+            hidden_states = outputs
+
+        if output_attentions:
+            self_attn_weights = outputs[1]
+
+        if use_cache:
+            present_key_value = outputs[2 if output_attentions else 1]
+
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        if use_cache:
+            outputs += (present_key_value,)
+
+        # remove empty tuple for pipeline parallel
+        if type(outputs) is tuple and len(outputs) == 1:
+            outputs = outputs[0]
+
+        return outputs
+
+
+class LlamaModel(nn.Layer):
+    def __init__(self, config: LlamaConfig):
+        super().__init__()
+        self.config = config
+        self.vocab_size = config.vocab_size
+        self.hidden_size = config.hidden_size
+
+        self.embed_tokens = nn.Embedding(
+            self.vocab_size,
+            self.hidden_size,
+        )
+
+        self.layers = nn.LayerList(
+            [LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)]
+        )
+        self.norm = LlamaRMSNorm(config)
+
+    @staticmethod
+    def _prepare_decoder_attention_mask(
+        attention_mask, input_shape, past_key_values_length, dtype
+    ):
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            if len(attention_mask.shape) == 2:
+                expanded_attn_mask = _expand_2d_mask(
+                    attention_mask, dtype, tgt_length=input_shape[-1]
+                )
+                # For decoding phase in generation, seq_length = 1, we don't need to add causal mask
+                if input_shape[-1] > 1:
+                    combined_attention_mask = _make_causal_mask(
+                        input_shape,
+                        past_key_values_length=past_key_values_length,
+                    )
+                    expanded_attn_mask = (
+                        expanded_attn_mask & combined_attention_mask
+                    )
+            # [bsz, seq_len, seq_len] -> [bsz, 1, seq_len, seq_len]
+            elif len(attention_mask.shape) == 3:
+                expanded_attn_mask = attention_mask.unsqueeze(1).astype("bool")
+            # if attention_mask is already 4-D, do nothing
+            else:
+                expanded_attn_mask = attention_mask
+        else:
+            expanded_attn_mask = _make_causal_mask(
+                input_shape, past_key_values_length=past_key_values_length
+            )
+        # Convert bool attention_mask to float attention mask, which will be added to attention_scores later
+        expanded_attn_mask = paddle.where(
+            expanded_attn_mask, 0.0, paddle.finfo(dtype).min
+        ).astype(dtype)
+        return expanded_attn_mask
+
+    def forward(
+        self,
+        input_ids=None,
+        position_ids=None,
+        attention_mask=None,
+        use_cache=None,
+    ):
+        output_attentions = False
+        output_hidden_states = False
+        use_cache = (
+            use_cache if use_cache is not None else self.config.use_cache
+        )
+
+        # retrieve input_ids
+        if input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        else:
+            raise ValueError("You have to specify input_ids")
+
+        past_key_values = tuple([None] * len(self.layers))
+        # NOTE: so that the cache can be cleared in time
+        past_key_values = list(past_key_values)
+
+        seq_length_with_past = seq_length
+        cache_length = 0
+        if past_key_values[0] is not None:
+            cache_length = paddle.shape(past_key_values[0][0])[1]
+            seq_length_with_past += cache_length
+        inputs_embeds = self.embed_tokens(input_ids)
+
+        # embed positions
+        if attention_mask is None:
+            # [bs, seq_len]
+            attention_mask = paddle.ones(
+                (batch_size, seq_length_with_past), dtype=paddle.bool
+            )
+
+        if position_ids is None:
+            position_ids = paddle.arange(seq_length, dtype="int64").expand(
+                (batch_size, seq_length)
+            )
+
+        attention_mask = self._prepare_decoder_attention_mask(
+            attention_mask,
+            (batch_size, seq_length),
+            cache_length,
+            inputs_embeds.dtype,
+        )  # [bs, 1, seq_len, seq_len]
+
+        hidden_states = inputs_embeds
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = () if use_cache else None
+
+        for idx, (decoder_layer) in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            past_key_value = (
+                past_key_values[idx] if past_key_values is not None else None
+            )
+
+            has_gradient = not hidden_states.stop_gradient
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                position_ids,
+                attention_mask,
+                output_attentions,
+                past_key_value,
+                use_cache,
+            )
+
+            # NOTE: clear the outdated cache after it has been used, to save memory
+            past_key_value = past_key_values[idx] = None
+            if type(layer_outputs) is tuple:
+                hidden_states = layer_outputs[0]
+            else:
+                hidden_states = layer_outputs
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+            if use_cache:
+                next_decoder_cache += (
+                    layer_outputs[2 if output_attentions else 1],
+                )
+
+        hidden_states = self.norm(hidden_states)
+
+        return hidden_states
+
+
+class TestLlamaModel(unittest.TestCase):
+    def setUp(self):
+        paddle.seed(2024)
+        self.prepare_data()
+
+    def prepare_data(self):
+        self.config = LlamaConfig()
+        self.input_ids = paddle.to_tensor(
+            [
+                [
+                    1,
+                    29871,
+                    31201,
+                    236,
+                    138,
+                    141,
+                    30287,
+                    30557,
+                    30015,
+                    233,
+                    187,
+                    172,
+                    31969,
+                    31325,
+                    31043,
+                    30374,
+                    30024,
+                ]
+            ],
+            dtype="int64",
+        )
+        self.position_ids = paddle.to_tensor(
+            [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]],
+            dtype="int64",
+        )
+        self.attention_mask = paddle.to_tensor(
+            [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype="int64"
+        )
+
+    def check_jit_kernel_info(self, static_fn):
+        utils.check_jit_kernel_number(static_fn, 1)
+        utils.check_jit_kernel_structure(static_fn, {utils.JIT_KERNEL_NAME: 1})
+
+    def eval(self, use_cinn):
+        paddle.seed(2024)
+        net = LlamaModel(self.config)
+        input_spec = [
+            InputSpec(shape=[None, None], dtype='int64'),  # input_ids
+            InputSpec(shape=[None, None], dtype='int64'),  # position_ids
+            InputSpec(shape=[None, None], dtype='int64'),  # attention_mask
+        ]
+        net = utils.apply_to_static(net, use_cinn, input_spec)
+        net.eval()
+        out = net(self.input_ids, self.position_ids, self.attention_mask)
+        return out
+
+    def test_eval(self):
+        dy_out = self.eval(use_cinn=False)
+        if utils.unittest_use_cinn():
+            cinn_out = self.eval(use_cinn=True)
+            np.testing.assert_allclose(
+                cinn_out.numpy(), dy_out.numpy(), atol=1e-6, rtol=1e-6
+            )
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/ir/pir/cinn/inference/test_llama_postprocess.py b/test/ir/pir/cinn/inference/test_llama_postprocess.py
new file mode 100644
index 0000000000000..dad923b4e98f7
--- /dev/null
+++ b/test/ir/pir/cinn/inference/test_llama_postprocess.py
@@ -0,0 +1,123 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys +import unittest +from os.path import dirname + +import numpy as np + +import paddle +import paddle.nn.functional as F +from paddle import nn +from paddle.static import InputSpec + +sys.path.append(dirname(dirname(__file__))) + +import utils + + +class LlamaPostProcess(nn.Layer): + def __init__(self): + super().__init__() + + def update_scores_for_generation( + self, scores, next_scores, length, unfinished_flag + ): + # update scores + unfinished_scores = (scores * length + next_scores) / (length + 1) + scores = paddle.where(unfinished_flag, unfinished_scores, scores) + return scores + + def _post_process_( + self, logits, input_ids, cur_len, origin_len, scores, unfinished_flag + ): + # [batch_size, vocab_size] + logits = logits[:, -1, :] + probs = F.softmax(logits) + + temperature = paddle.full([1], 1) + top_p = paddle.full([1], 0) + + # sample + origin_probs = F.log_softmax(logits) + # compute next_tokens + logits = logits / temperature + top_ps_tensor = paddle.full( + shape=[paddle.shape(probs)[0], 1], + fill_value=top_p, + dtype=probs.dtype, + ) + _, next_tokens = paddle.tensor.top_p_sampling(probs, top_ps_tensor) + + next_scores = paddle.index_sample(origin_probs, next_tokens) + scores = self.update_scores_for_generation( + scores, next_scores, cur_len - origin_len, unfinished_flag + ) + + input_ids = paddle.concat([input_ids, next_tokens], axis=1) + + return input_ids, scores + + def forward(self, logits, input_ids): + batch_size, cur_len = paddle.shape(input_ids) + origin_len = paddle.shape(input_ids)[1] + unfinished_flag = paddle.full([batch_size, 1], True, dtype="bool") + scores = paddle.full( + [batch_size, 1], 0.0, dtype=paddle.get_default_dtype() + ) + return self._post_process_( + logits, input_ids, cur_len, origin_len, scores, unfinished_flag + ) + + +class TestLlamaPostProcess(unittest.TestCase): + def setUp(self): + paddle.seed(2024) + self.prepare_data() + + def prepare_data(self): + self.shape = [1, 2048, 768] + self.logits = paddle.randn([1, 256, 3200], dtype="float32") + self.input_ids = paddle.randint(0, 512, [1, 32], dtype="int64") + + def check_jit_kernel_info(self, static_fn): + utils.check_jit_kernel_number(static_fn, 1) + utils.check_jit_kernel_structure(static_fn, {utils.JIT_KERNEL_NAME: 1}) + + def eval(self, use_cinn): + paddle.seed(2024) + net = LlamaPostProcess() + input_spec = [ + InputSpec(shape=[None, None, None], dtype='float32'), # logits + InputSpec(shape=[None, None], dtype='int64'), # input_ids + ] + net = utils.apply_to_static(net, use_cinn, input_spec) + net.eval() + # paddle.jit.save(net, sys.path.join(dirname(__file__), "post_model")) + out = net(self.logits, self.input_ids) + if use_cinn: + self.check_jit_kernel_info(net.forward) + return out + + def test_eval(self): + dy_out = self.eval(use_cinn=False) + if utils.unittest_use_cinn(): + cinn_out = self.eval(use_cinn=True) + np.testing.assert_allclose( + cinn_out.numpy(), dy_out.numpy(), atol=1e-6, rtol=1e-6 + ) + + +if __name__ == '__main__': + unittest.main() From f9f6d408482897915dedaa7764bfb30feb73367c Mon Sep 17 00:00:00 2001 From: co63oc Date: Fri, 1 Mar 2024 11:15:45 +0800 Subject: [PATCH 47/55] Fix calibraion calibration, etc (#62259) --- .../analysis/ir_passes/tensorrt_subgraph_pass.cc | 2 +- paddle/fluid/inference/api/paddle_analysis_config.h | 8 ++++---- paddle/fluid/inference/api/resource_manager.cc | 10 +++++----- paddle/fluid/inference/api/resource_manager.h | 2 +- paddle/fluid/inference/capi/pd_config.cc | 4 ++-- paddle/fluid/inference/capi/pd_predictor.cc | 2 +- 6 files 
changed, 14 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 5b2bed7745fcf..1b29ba37f5e66 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -754,7 +754,7 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( bool calibration_mode = (enable_int8 && calibration_data.empty() && use_calib_mode); if (calibration_mode) { - // calibraion mode means generate int8 calibration table data process. + // calibration mode means generate int8 calibration table data process. return calibration_engine_key; } diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index cae544ff2c234..134c0799ec663 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -253,7 +253,7 @@ struct PD_INFER_DECL AnalysisConfig { void SetModel(const std::string& model_dir) { model_dir_ = model_dir; } /// - /// \brief Set the combined model with two specific pathes for program and + /// \brief Set the combined model with two specific paths for program and /// parameters. /// /// \param prog_file_path model file path of the combined model. @@ -596,12 +596,12 @@ struct PD_INFER_DECL AnalysisConfig { /// \brief Control whether to perform IR graph optimization. /// If turned off, the AnalysisConfig will act just like a NativeConfig. /// - /// \param x Whether the ir graph optimization is actived. + /// \param x Whether the ir graph optimization is activated. /// void SwitchIrOptim(int x = true) { enable_ir_optim_ = x; } /// /// \brief A boolean state telling whether the ir graph optimization is - /// actived. + /// activated. /// /// \return bool Whether to use ir graph optimization. /// @@ -1213,7 +1213,7 @@ struct PD_INFER_DECL AnalysisConfig { std::string SerializeInfoCache(); protected: - // Model pathes. + // Model paths. 
std::string model_dir_; mutable std::string prog_file_; mutable std::string params_file_; diff --git a/paddle/fluid/inference/api/resource_manager.cc b/paddle/fluid/inference/api/resource_manager.cc index b18ca6e1c2a55..9f8a6651ebdf8 100644 --- a/paddle/fluid/inference/api/resource_manager.cc +++ b/paddle/fluid/inference/api/resource_manager.cc @@ -191,7 +191,7 @@ void GPUContextResource::InitGpuEigenDevice() { gpu_eigen_device_ = std::make_unique(eigen_stream_.get()); } -void GPUContextResource::InitDnnHanlde() { +void GPUContextResource::InitDnnHandle() { phi::InitDnnHandle(&dnn_handle_, stream_, place_); } @@ -237,7 +237,7 @@ dnnHandle_t GPUContextResource::GetDnnHandle() const { return dnn_handle_; } std::function GPUContextResource::GetDnnHandleCreator() { return [&]() -> phi::dnnHandle_t { - InitDnnHanlde(); + InitDnnHandle(); return dnn_handle_; }; } @@ -367,7 +367,7 @@ ResourceManager& ResourceManager::Instance() { } void ResourceManager::InitCPUResource() { - std::lock_guard lock_gurad(cpu_mutex_); + std::lock_guard lock_guard(cpu_mutex_); if (cpu_resource_ == nullptr) { cpu_resource_ = std::make_unique(); } @@ -382,7 +382,7 @@ CPUContextResource* ResourceManager::GetCPUResource() const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void* ResourceManager::InitGPUResource(const phi::Place& place, void* stream) { - std::lock_guard lock_gurad(gpu_mutex_); + std::lock_guard lock_guard(gpu_mutex_); if (gpu_resources_.count(stream)) { Increase(stream); return stream; @@ -427,7 +427,7 @@ GPUContextResource* ResourceManager::GetGPUResource(void* stream) const { void ResourceManager::GpuResourceSwitchStream(void* old_stream, void* new_stream) { // NOTE: add lock to support stream rebind in multi-thread - std::lock_guard lock_gurad(gpu_mutex_); + std::lock_guard lock_guard(gpu_mutex_); if (old_stream == new_stream) return; PADDLE_ENFORCE_EQ( gpu_resources_.count(old_stream), diff --git a/paddle/fluid/inference/api/resource_manager.h b/paddle/fluid/inference/api/resource_manager.h index 1f4d4ea420e1b..25b4050e7c4dd 100644 --- a/paddle/fluid/inference/api/resource_manager.h +++ b/paddle/fluid/inference/api/resource_manager.h @@ -88,7 +88,7 @@ class GPUContextResource { void DestroyGPUResource(); void InitGpuProperties(); void InitGpuEigenDevice(); - void InitDnnHanlde(); + void InitDnnHandle(); void DestroyDnnHandle(); void DestroyBlasHandle(); void InitBlasLtHandle(); diff --git a/paddle/fluid/inference/capi/pd_config.cc b/paddle/fluid/inference/capi/pd_config.cc index 5197b8dede192..c2c8036ece7a8 100644 --- a/paddle/fluid/inference/capi/pd_config.cc +++ b/paddle/fluid/inference/capi/pd_config.cc @@ -275,7 +275,7 @@ void PD_EnableDlnne( int max_batch_size, bool use_static_batch, std::string weight_share_mode, - std::unordered_set disable_nodes_by_ouputs, + std::unordered_set disable_nodes_by_outputs, std::map> dlnne_input_shape_dict, bool use_calib_mode, PD_ACPrecision precision_mode) { @@ -287,7 +287,7 @@ void PD_EnableDlnne( max_batch_size, use_static_batch, weight_share_mode, - disable_nodes_by_ouputs, + disable_nodes_by_outputs, dlnne_input_shape_dict, use_calib_mode, precision_mode); diff --git a/paddle/fluid/inference/capi/pd_predictor.cc b/paddle/fluid/inference/capi/pd_predictor.cc index 39575a196e4f9..72f1b6c277153 100644 --- a/paddle/fluid/inference/capi/pd_predictor.cc +++ b/paddle/fluid/inference/capi/pd_predictor.cc @@ -92,7 +92,7 @@ bool PD_PredictorRun(const PD_AnalysisConfig* config, config, paddle::platform::errors::InvalidArgument( "The pointer of analysis 
configuration shouldn't be nullptr")); - VLOG(3) << "Predoctor: PD_PredictorRun. "; + VLOG(3) << "Predictor: PD_PredictorRun. "; static std::map> predictors; if (!predictors.count(config->config.model_dir())) { From 512d594060232ea1131ff3379ed0dd769f0ef4ed Mon Sep 17 00:00:00 2001 From: co63oc Date: Fri, 1 Mar 2024 11:16:12 +0800 Subject: [PATCH 48/55] Fix is_sparese is_sparse, etc (#62258) --- .../fluid/distributed/collective/reducer.cc | 2 +- .../distributed/ps/service/brpc_ps_client.cc | 6 ++--- .../distributed/ps/service/brpc_ps_server.cc | 22 +++++++++---------- .../ps/service/coordinator_client.h | 4 ++-- .../ps/service/graph_brpc_server.cc | 2 +- paddle/fluid/imperative/prepared_operator.h | 2 +- paddle/fluid/imperative/reducer.cc | 6 +++-- 7 files changed, 23 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 68ccd8f52fa10..df41993bb9bd2 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -894,7 +894,7 @@ void EagerReducer::MarkVarReady(const size_t var_index, "The sparse parameter[%d][%s] should have gradient. " "Currently, DataParallel does not support sparse " "parameters without generating gradients during training. " - "For example, if is_sparese=True is used in Embedding, " + "For example, if is_sparse=True is used in Embedding, " "the current step of this parameter cannot generate gradient " "because of stop_gradient/detach, where error will occur.", var_index, diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc index 89150deff544a..fa9f16db05b6e 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc @@ -402,7 +402,7 @@ int DownpourBrpcClosure::check_response(size_t request_idx, int cmd_id) { int DownpourBrpcClosure::check_save_response(size_t request_idx, int cmd_id) { int32_t feasign_size = 0; if (_cntls[request_idx]->Failed()) { - LOG(ERROR) << "resquest cmd_id:" << cmd_id + LOG(ERROR) << "request cmd_id:" << cmd_id << " failed, " "err:" << _cntls[request_idx]->ErrorText(); @@ -426,7 +426,7 @@ std::string DownpourBrpcClosure::get_response(size_t request_idx, int cmd_id) { int FlClientBrpcClosure::check_response(size_t request_idx, int cmd_id) { if (_cntls[request_idx]->Failed()) { - LOG(ERROR) << "resquest cmd_id:" << cmd_id + LOG(ERROR) << "request cmd_id:" << cmd_id << " failed, " "err:" << _cntls[request_idx]->ErrorText(); @@ -1712,7 +1712,7 @@ void BrpcPsClient::PushSparseTaskConsume() { merge_status[shard_idx].wait(); } - // meger到task_list[0] + // merge到task_list[0] auto async_task = new SparseAsyncTask(*(task_list[0].get())); task_queue->Put(std::move(async_task)); diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_server.cc b/paddle/fluid/distributed/ps/service/brpc_ps_server.cc index 8d73a563d79f1..b1c58ba7acda4 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_server.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_server.cc @@ -262,7 +262,7 @@ void BrpcPsService::service(google::protobuf::RpcController *cntl_base, brpc::ClosureGuard done_guard(done); std::string log_label("ReceiveCmd-"); if (!request->has_table_id()) { - set_response_code(*response, -1, "PsRequestMessage.tabel_id is required"); + set_response_code(*response, -1, "PsRequestMessage.table_id is required"); return; } @@ -307,7 +307,7 @@ int32_t BrpcPsService::PullDense(Table *table, 
set_response_code( response, -1, - "PsRequestMessage.datas is requeired at least 1 for num of dense"); + "PsRequestMessage.datas is required at least 1 for num of dense"); return 0; } CostTimer timer("pserver_server_pull_dense"); @@ -409,7 +409,7 @@ int32_t BrpcPsService::Barrier(Table *table, if (request.params_size() < 1) { set_response_code(response, -1, - "PsRequestMessage.params is requeired at " + "PsRequestMessage.params is required at " "least 1 for num of sparse_key"); return 0; } @@ -436,7 +436,7 @@ int32_t BrpcPsService::PushSparseParam(Table *table, if (request.params_size() < 1) { set_response_code(response, -1, - "PsRequestMessage.params is requeired at " + "PsRequestMessage.params is required at " "least 1 for num of sparse_key"); return 0; } @@ -515,7 +515,7 @@ int32_t BrpcPsService::PullSparse(Table *table, if (request.params_size() < 1) { set_response_code(response, -1, - "PsRequestMessage.params is requeired at " + "PsRequestMessage.params is required at " "least 1 for num of sparse_key"); return 0; } @@ -565,7 +565,7 @@ int32_t BrpcPsService::PushSparse(Table *table, if (request.params_size() < 1) { set_response_code(response, -1, - "PsRequestMessage.params is requeired at " + "PsRequestMessage.params is required at " "least 1 for num of sparse_key"); return 0; } @@ -616,7 +616,7 @@ int32_t BrpcPsService::LoadOneTable(Table *table, set_response_code( response, -1, - "PsRequestMessage.datas is requeired at least 2 for path & load_param"); + "PsRequestMessage.datas is required at least 2 for path & load_param"); return -1; } if (table->Load(request.params(0), request.params(1)) != 0) { @@ -649,7 +649,7 @@ int32_t BrpcPsService::SaveOneTable(Table *table, set_response_code( response, -1, - "PsRequestMessage.datas is requeired at least 2, path&mode"); + "PsRequestMessage.datas is required at least 2, path&mode"); return -1; } table->Flush(); @@ -691,7 +691,7 @@ int32_t BrpcPsService::SaveCacheTable(Table *table, set_response_code( response, -1, - "PsRequestMessage.datas is requeired at least 3, path&mode"); + "PsRequestMessage.datas is required at least 3, path&mode"); return -1; } table->Flush(); @@ -717,7 +717,7 @@ int32_t BrpcPsService::CacheShuffle(Table *table, if (request.params_size() < 3) { set_response_code(response, -1, - "PsRequestMessage.datas is requeired at least 3, " + "PsRequestMessage.datas is required at least 3, " "path&mode&cache_threshold"); return -1; } @@ -805,7 +805,7 @@ int32_t BrpcPsService::ShrinkTable(Table *table, set_response_code( response, -1, - "PsRequestMessage.datas is requeired at least 1, threshold"); + "PsRequestMessage.datas is required at least 1, threshold"); return -1; } table->Flush(); diff --git a/paddle/fluid/distributed/ps/service/coordinator_client.h b/paddle/fluid/distributed/ps/service/coordinator_client.h index 8db08c3fc7999..f0d1116fca268 100644 --- a/paddle/fluid/distributed/ps/service/coordinator_client.h +++ b/paddle/fluid/distributed/ps/service/coordinator_client.h @@ -81,7 +81,7 @@ class CoordinatorServiceHandle { lck.unlock(); VLOG(0) << "last_round_total_fl_clients_num: " << last_round_total_fl_clients_num - << ", has recved fl client num: " << _fl_clients_count.load(); + << ", has received fl client num: " << _fl_clients_count.load(); return; } @@ -102,7 +102,7 @@ class CoordinatorServiceHandle { timeline.Pause(); query_wait_time += timeline.ElapsedSec(); } - // LOG(WARNNING) << "fl-ps > query_wait_time exceed!"; + // LOG(WARNING) << "fl-ps > query_wait_time exceed!"; return true; }; diff --git 
a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc index 0a8867bb66e11..df0c1a8fd3a6c 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc @@ -247,7 +247,7 @@ void GraphBrpcService::service(google::protobuf::RpcController *cntl_base, brpc::ClosureGuard done_guard(done); std::string log_label("ReceiveCmd-"); if (!request->has_table_id()) { - set_response_code(*response, -1, "PsRequestMessage.tabel_id is required"); + set_response_code(*response, -1, "PsRequestMessage.table_id is required"); return; } diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 70c36b27d31c0..42a50cec23558 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -559,7 +559,7 @@ void BuildDygraphPhiKernelContext(const phi::KernelSignature& kernel_signature, PADDLE_ENFORCE_NOT_NULL( attr_ptr, platform::errors::NotFound("(%s) is not found in AttributeMap when " - "buildind dygraph KernelContext.", + "building dygraph KernelContext.", attr_names[i])); auto& attr = *attr_ptr; switch (attr_defs[i].type_index) { diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 5b8dc28d03111..93e6b10e6488e 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -493,8 +493,10 @@ void Reducer::PrepareDeps(const std::unordered_set &init_nodes) { "using PyLayer in a DataParallel model, you can skip gradient " "synchronization among multiple cards by 'no_sync', and " "manually implement 'all_reduce' before model optimization. " - "There is an example showing specific implemetation processing " - "in offical docs: https://www.paddlepaddle.org.cn/documentation" + "There is an example showing specific implementation " + "processing " + "in official docs: " + "https://www.paddlepaddle.org.cn/documentation" "/docs/api/paddle/DataParallel_cn.html")); } ++node_deps_[grad_pending_node.get()]; From 6b3f074c0e960a3e5f9235362005fe2340d96cd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Fri, 1 Mar 2024 11:20:47 +0800 Subject: [PATCH 49/55] =?UTF-8?q?=E3=80=90paddle=5Ftest=20No.27=E3=80=91re?= =?UTF-8?q?place=20parts=20of=20cc=5Ftest=20with=20paddle=5Ftest=20=20(#61?= =?UTF-8?q?675)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update CMakeLists.txt * add TEST_API and rm use_it_self_op * fix code-style * Update CMakeLists.txt * Apply suggestions from code review * Update CMakeLists.txt * Update test_common_infer_shape_functions.cc * replace cc with paddle_test * Update selected_rows.h * delete use_op_itself * Update CMakeLists.txt * add TEST_API * Update copy_cross_scope_test.cc * try to add TEST_API * try to add TEST_API * Update CMakeLists.txt --- paddle/fluid/framework/shape_inference.h | 7 ++- paddle/fluid/imperative/var_helper.h | 2 +- .../memory/allocation/allocator_facade.h | 13 ++-- paddle/fluid/memory/memcpy.cc | 34 +++++------ paddle/fluid/memory/memcpy.h | 4 +- .../operators/common_infer_shape_functions.h | 7 ++- paddle/phi/core/selected_rows.h | 3 +- test/cpp/fluid/CMakeLists.txt | 60 ++++--------------- test/cpp/fluid/copy_cross_scope_test.cc | 2 - test/cpp/fluid/save_load_combine_op_test.cc | 5 -- test/cpp/fluid/save_load_op_test.cc | 4 -- test/cpp/fluid/share_buffer_op_test.cc | 8 --- 12 
files changed, 50 insertions(+), 99 deletions(-) diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h index 49603b34255db..427d4be4558e9 100644 --- a/paddle/fluid/framework/shape_inference.h +++ b/paddle/fluid/framework/shape_inference.h @@ -78,13 +78,14 @@ class InferShapeContext { virtual DDim GetInputDim(const std::string &name) const = 0; virtual std::vector GetInputsDim(const std::string &name) const = 0; - virtual std::vector GetReaderDims(const std::string &name) const; + TEST_API virtual std::vector GetReaderDims( + const std::string &name) const; virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0; virtual void SetOutputsDim(const std::string &name, const std::vector &dims) = 0; - virtual void SetReaderDims(const std::string &name, - const std::vector &dims); + TEST_API virtual void SetReaderDims(const std::string &name, + const std::vector &dims); virtual std::string GetInputNameByIdx(size_t idx) const = 0; virtual std::string GetOutputNameByIdx(size_t idx) const = 0; virtual AttrReader Attrs() const = 0; diff --git a/paddle/fluid/imperative/var_helper.h b/paddle/fluid/imperative/var_helper.h index ebf3e49c51870..1a74d987e7e2b 100644 --- a/paddle/fluid/imperative/var_helper.h +++ b/paddle/fluid/imperative/var_helper.h @@ -40,7 +40,7 @@ void InitializeVariable(paddle::framework::Variable* var, template const paddle::platform::Place& GetPlace(const std::shared_ptr& var); template -const std::string& GetNameFromVar(std::shared_ptr var); +TEST_API const std::string& GetNameFromVar(std::shared_ptr var); template bool CheckCachedKey(std::shared_ptr tensor, const phi::KernelKey& key); diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index f80fcac1b2a38..f0f321b887b59 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -49,11 +49,12 @@ class AllocatorFacade { const AllocatorFacade& operator=(const AllocatorFacade& o) = delete; ~AllocatorFacade(); - static AllocatorFacade& Instance(); + TEST_API static AllocatorFacade& Instance(); AllocatorFacadePrivate* GetPrivate() const; - const std::shared_ptr& GetAllocator(const platform::Place& place); + TEST_API const std::shared_ptr& GetAllocator( + const platform::Place& place); void* GetBasePtr(const std::shared_ptr& allocation); @@ -88,8 +89,8 @@ class AllocatorFacade { void RecordStream(std::shared_ptr allocation, gpuStream_t stream); void EraseStream(std::shared_ptr allocation, gpuStream_t stream); - const std::shared_ptr& GetAllocator(const platform::Place& place, - gpuStream_t stream); + TEST_API const std::shared_ptr& GetAllocator( + const platform::Place& place, gpuStream_t stream); gpuStream_t GetStream(const std::shared_ptr& allocation) const; void SetDefaultStream(const platform::CUDAPlace& place, gpuStream_t stream); #endif @@ -104,8 +105,8 @@ class AllocatorFacade { phi::stream::stream_t stream); void RecordStream(std::shared_ptr allocation, phi::stream::stream_t stream); - const std::shared_ptr& GetAllocator(const platform::Place& place, - phi::stream::stream_t stream); + TEST_API const std::shared_ptr& GetAllocator( + const platform::Place& place, phi::stream::stream_t stream); phi::stream::stream_t GetStream( const std::shared_ptr& allocation) const; void SetDefaultStream(const platform::CustomPlace& place, diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 7cdf93514c52c..6ba7b4ac1d613 100644 --- 
a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -638,12 +638,12 @@ void Copy(phi::Place dst_place, // NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CPUPlace). template <> -void Copy(phi::CPUPlace dst_place, - void* dst, - phi::Place src_place, - const void* src, - size_t num, - void* stream) { +TEST_API void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } @@ -835,11 +835,11 @@ TEST_API void Copy(phi::Place dst_place, // NOTE: Only for (CPUPlace and PinnedPlace) -> (CPUPlace). template <> -void Copy(phi::CPUPlace dst_place, - void* dst, - phi::Place src_place, - const void* src, - size_t num) { +TEST_API void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num); } @@ -872,12 +872,12 @@ void Copy(phi::Place dst_place, } template <> -void Copy(phi::CPUPlace dst_place, - void* dst, - phi::Place src_place, - const void* src, - size_t num, - void* stream) { +TEST_API void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } diff --git a/paddle/fluid/memory/memcpy.h b/paddle/fluid/memory/memcpy.h index c8d9208c48219..b0a9234817f0a 100644 --- a/paddle/fluid/memory/memcpy.h +++ b/paddle/fluid/memory/memcpy.h @@ -31,7 +31,7 @@ namespace memory { * */ template -void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); +TEST_API void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); /** * \brief Copy memory from one place to another place. @@ -51,7 +51,7 @@ void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); * */ template -void Copy( +TEST_API void Copy( DstPlace, void* dst, SrcPlace, const void* src, size_t num, void* stream); } // namespace memory } // namespace paddle diff --git a/paddle/fluid/operators/common_infer_shape_functions.h b/paddle/fluid/operators/common_infer_shape_functions.h index 5ce21b1de529b..a61686f3f7544 100644 --- a/paddle/fluid/operators/common_infer_shape_functions.h +++ b/paddle/fluid/operators/common_infer_shape_functions.h @@ -34,12 +34,13 @@ framework::DDim BroadcastTwoDims(const framework::DDim& x_dims, int axis = -1); } // shape input(0) -> output(0) without change. 
-void UnaryOpUnchangedInferShape(framework::InferShapeContext* ctx); +TEST_API void UnaryOpUnchangedInferShape(framework::InferShapeContext* ctx); // shape input(0) -> output(0) without change, check if axis in range [-Rank(x), // Rank(x)-1] -void UnaryOpUnchangedInferShapeCheckAxis(framework::InferShapeContext* ctx); +TEST_API void UnaryOpUnchangedInferShapeCheckAxis( + framework::InferShapeContext* ctx); // broadcast input(0) and input(1) -> output(0) -void BinaryOpBroadcastInferShape(framework::InferShapeContext* ctx); +TEST_API void BinaryOpBroadcastInferShape(framework::InferShapeContext* ctx); } // namespace operators } // namespace paddle diff --git a/paddle/phi/core/selected_rows.h b/paddle/phi/core/selected_rows.h index 7674a8e8722bc..145f7e7d3b2e4 100644 --- a/paddle/phi/core/selected_rows.h +++ b/paddle/phi/core/selected_rows.h @@ -42,7 +42,8 @@ class SelectedRows : public TensorBase, * */ public: - SelectedRows(const std::vector& rows, const int64_t& height); + TEST_API SelectedRows(const std::vector& rows, + const int64_t& height); TEST_API SelectedRows(); diff --git a/test/cpp/fluid/CMakeLists.txt b/test/cpp/fluid/CMakeLists.txt index f49eefb4354d0..3a8f9326764cb 100644 --- a/test/cpp/fluid/CMakeLists.txt +++ b/test/cpp/fluid/CMakeLists.txt @@ -33,14 +33,12 @@ endif() set(COMMON_OP_DEPS ${COMMON_OP_DEPS} executor) if(WITH_XPU) - cc_test( - beam_search_decode_op_xpu_test - SRCS beam_search_decode_op_xpu_test.cc - DEPS lod_tensor) + paddle_test(beam_search_decode_op_xpu_test SRCS + beam_search_decode_op_xpu_test.cc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xpulib) endif() -cc_test( +nv_test( test_common_infer_shape_functions SRCS test_common_infer_shape_functions.cc DEPS common_infer_shape_functions @@ -51,30 +49,12 @@ cc_test( phi common generated_static_op) -cc_test( - gather_test - SRCS gather_test.cc - DEPS tensor) -cc_test( - assign_op_test - SRCS assign_op_test.cc - DEPS generated_static_op) -cc_test( - scatter_test - SRCS scatter_test.cc - DEPS tensor phi common) -cc_test( - beam_search_decode_op_test - SRCS beam_search_decode_op_test.cc - DEPS lod_tensor) -cc_test( - save_load_op_test - SRCS save_load_op_test.cc - DEPS save_op load_op) -cc_test( - save_load_combine_op_test - SRCS save_load_combine_op_test.cc - DEPS save_combine_op load_combine_op) +paddle_test(gather_test SRCS gather_test.cc) +paddle_test(assign_op_test SRCS assign_op_test.cc) +paddle_test(scatter_test SRCS scatter_test.cc DEPS common) +paddle_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc) +paddle_test(save_load_op_test SRCS save_load_op_test.cc) +paddle_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc) if(WITH_CINN) set(CINN_DEPS python) endif() @@ -109,15 +89,10 @@ elseif(WITH_ROCM) test_leaky_relu_grad_grad_functor.cu DEPS tensor device_context eigen3) else() - cc_test( - test_leaky_relu_grad_grad_functor - SRCS test_leaky_relu_grad_grad_functor.cc - DEPS tensor device_context eigen3) + paddle_test(test_leaky_relu_grad_grad_functor SRCS + test_leaky_relu_grad_grad_functor.cc) endif() -cc_test( - share_buffer_op_cpp_test - SRCS share_buffer_op_test.cc - DEPS lod_tensor device_context generated_static_op) +paddle_test(share_buffer_op_cpp_test SRCS share_buffer_op_test.cc) if(WITH_CINN) paddle_test(op_debug_string_test SRCS op_debug_string_test.cc) @@ -126,16 +101,7 @@ else() endif() if(WITH_GPU) - cc_test( - copy_cross_scope_test - SRCS copy_cross_scope_test.cc - DEPS op_registry - copy_cross_scope_op - scope - device_context - enforce - executor - common) + 
paddle_test(copy_cross_scope_test SRCS copy_cross_scope_test.cc) endif() if(WITH_ONNXRUNTIME AND WIN32) diff --git a/test/cpp/fluid/copy_cross_scope_test.cc b/test/cpp/fluid/copy_cross_scope_test.cc index f6f7eb31cb8e6..3d2033d77fe80 100644 --- a/test/cpp/fluid/copy_cross_scope_test.cc +++ b/test/cpp/fluid/copy_cross_scope_test.cc @@ -33,8 +33,6 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -USE_NO_KERNEL_OP(copy_cross_scope); - template void Compare1(f::Scope* scope, const p::DeviceContext& ctx, diff --git a/test/cpp/fluid/save_load_combine_op_test.cc b/test/cpp/fluid/save_load_combine_op_test.cc index 8f85676b1ba55..f97409d6535ab 100644 --- a/test/cpp/fluid/save_load_combine_op_test.cc +++ b/test/cpp/fluid/save_load_combine_op_test.cc @@ -22,11 +22,6 @@ limitations under the License. */ #include "paddle/fluid/platform/float16.h" #include "paddle/phi/core/kernel_registry.h" -USE_OP_ITSELF(save_combine); -USE_OP_ITSELF(load_combine); -PD_DECLARE_KERNEL(save_combine_tensor, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(load_combine, CPU, ALL_LAYOUT); - template T* CreateForSaveCombineOp(int x, int y, diff --git a/test/cpp/fluid/save_load_op_test.cc b/test/cpp/fluid/save_load_op_test.cc index 5ec376b71de17..5ddb0afb03616 100644 --- a/test/cpp/fluid/save_load_op_test.cc +++ b/test/cpp/fluid/save_load_op_test.cc @@ -17,12 +17,8 @@ limitations under the License. */ #include "paddle/fluid/platform/float16.h" #include "paddle/phi/core/kernel_registry.h" -USE_OP_ITSELF(save); -PD_DECLARE_KERNEL(save, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(save_sr, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(cast, CPU, ALL_LAYOUT); -USE_OP_ITSELF(load); -PD_DECLARE_KERNEL(load, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(load_sr, CPU, ALL_LAYOUT); TEST(SaveLoadOp, CPU) { diff --git a/test/cpp/fluid/share_buffer_op_test.cc b/test/cpp/fluid/share_buffer_op_test.cc index d576ba6ecfcea..eb042acf06ff2 100644 --- a/test/cpp/fluid/share_buffer_op_test.cc +++ b/test/cpp/fluid/share_buffer_op_test.cc @@ -20,14 +20,6 @@ #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/kernel_registry.h" -USE_OP_ITSELF(share_buffer); - -PD_DECLARE_KERNEL(share_buffer, CPU, ALL_LAYOUT); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_DECLARE_KERNEL(share_buffer, GPU, ALL_LAYOUT); -#endif - namespace paddle { namespace framework { From 7620c500fa7b85790661a50265c23b1bf32d3b63 Mon Sep 17 00:00:00 2001 From: Tian <121000916+SylarTiaNII@users.noreply.github.com> Date: Fri, 1 Mar 2024 11:21:06 +0800 Subject: [PATCH 50/55] [Distributed] fix sharding overlap comm on npu (#62236) --- .../fleet/meta_parallel/sharding/group_sharded_utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py index 046143c79842f..552d36afb1dda 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py @@ -341,7 +341,10 @@ def cvt_to_device(x, dev_id, blocking=True): elif paddle.is_compiled_with_xpu(): place = paddle.XPUPlace(dev_id) else: - raise OSError( - "Only supported compiled paddle with gpu/rocm and xpu, but current version is compiled with cpu." 
- ) + supported_custom_devices = ["npu"] + place = paddle.framework._current_expected_place() + if place.get_device_type() not in supported_custom_devices: + raise OSError( + "Only supported compiled paddle with gpu/rocm and xpu, but current version is compiled with cpu." + ) return x._copy_to(place, blocking) From 85ba93655e6ed9e0eb4f04ef62bbfb312796f3f4 Mon Sep 17 00:00:00 2001 From: ming1753 <61511741+ming1753@users.noreply.github.com> Date: Fri, 1 Mar 2024 14:33:27 +0800 Subject: [PATCH 51/55] fix delete scale and zero_point var bug (#62225) * fix delete scale and zero_point var bug --- .../ir/delete_quant_dequant_linear_op_pass.cc | 17 +++++++---------- paddle/fluid/framework/ir/fuse_pass_base.h | 5 +++++ .../trt_delete_weight_dequant_linear_op_pass.cc | 17 +++++++---------- .../passes/save_optimized_model_pass.cc | 12 ++++++++++-- 4 files changed, 29 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc index 9d4006e6f3943..b8a5dfdaa9465 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc @@ -124,14 +124,18 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { return; } */ - std::unordered_set nodes2rm = {}; - - // delete Scale and ZeroPoint tensor in scope + // Scale and ZeroPoint tensor should be removed in save_optimized_model_pass std::vector vars2rm = {}; vars2rm.emplace_back(quantize_linear_op->Op()->Input("Scale")[0]); vars2rm.emplace_back(quantize_linear_op->Op()->Input("ZeroPoint")[0]); vars2rm.emplace_back(dequantize_linear_op->Op()->Input("Scale")[0]); vars2rm.emplace_back(dequantize_linear_op->Op()->Input("ZeroPoint")[0]); + auto& scale_and_zero_point_param = g->GetOrInit>( + framework::ir::kScaleAndZeroPointParamAttr); + scale_and_zero_point_param.insert( + scale_and_zero_point_param.end(), vars2rm.begin(), vars2rm.end()); + + std::unordered_set nodes2rm = {}; // Get input scale from tensor const phi::DenseTensor& input_scale_tensor = @@ -182,13 +186,6 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { nodes2rm.insert(dequantize_linear_op); nodes2rm.insert(dequantize_linear_op_out); GraphSafeRemoveNodes(graph, nodes2rm); - - for (auto& var_name : vars2rm) { - if (scope->FindVar(var_name)) { - scope->EraseVars({var_name}); - } - } - found_count++; }; gpd(graph, handler); diff --git a/paddle/fluid/framework/ir/fuse_pass_base.h b/paddle/fluid/framework/ir/fuse_pass_base.h index bc5fc2a16d393..d8522f1aeaabe 100644 --- a/paddle/fluid/framework/ir/fuse_pass_base.h +++ b/paddle/fluid/framework/ir/fuse_pass_base.h @@ -40,6 +40,11 @@ static const char kFuseStatisAttr[] = "__fuse_statis__"; // allocation. static const char kRepetitiveParamAttr[] = "__repetitive_param__"; +// scale and zero point of the quantized/dequantized op should be removed in +// save_optimized_model_pass. 
+static const char kScaleAndZeroPointParamAttr[] = + "__scale_and_zero_point_param__"; + enum FuseOptions { DO_NOT_FUSE, // fusing will not be done FUSE_NATIVE, // fusing will be done without MKL-DNN diff --git a/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc index b780c07fda0a6..6bc9cb324d80d 100644 --- a/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc @@ -231,13 +231,17 @@ void TrtDeleteWeightQuantDequantLinearOpPass::ApplyImpl( return; } */ - std::unordered_set nodes2rm = {}; - - // delete Scale and ZeroPoint tensor in scope + // Scale and ZeroPoint tensor should be removed in save_optimized_model_pass std::vector vars2rm = {}; vars2rm.emplace_back(weight_dequantize_linear_op->Op()->Input("Scale")[0]); vars2rm.emplace_back( weight_dequantize_linear_op->Op()->Input("ZeroPoint")[0]); + auto& scale_and_zero_point_param = g->GetOrInit>( + framework::ir::kScaleAndZeroPointParamAttr); + scale_and_zero_point_param.insert( + scale_and_zero_point_param.end(), vars2rm.begin(), vars2rm.end()); + + std::unordered_set nodes2rm = {}; int bit_length = PADDLE_GET_CONST( int, weight_dequantize_linear_op->Op()->GetAttr("bit_length")); @@ -363,13 +367,6 @@ void TrtDeleteWeightQuantDequantLinearOpPass::ApplyImpl( } GraphSafeRemoveNodes(graph, nodes2rm); - - for (auto& var_name : vars2rm) { - if (scope->FindVar(var_name)) { - scope->EraseVars({var_name}); - } - } - found_count++; }; gpd(graph, handler); diff --git a/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc b/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc index 8d988de162100..89b49df107390 100644 --- a/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc +++ b/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/scope.h" @@ -37,10 +38,17 @@ void SaveOptimizedModelPass::SaveOptimizedModel(Argument* argument) { framework::ir::GraphToProgram(*graph, &optimized_program_desc); - // Some vars may be deleted by pass, so we need to remove them in block + // Remove the scale and zero point parameters from optimized program. 
+ auto scale_and_zero_point_param = graph->GetOrInit>( + framework::ir::kScaleAndZeroPointParamAttr); framework::BlockDesc* block = optimized_program_desc.MutableBlock(0); for (auto& var_desc : block->AllVars()) { - if (var_desc->Persistable() && !scope.FindVar(var_desc->Name())) { + auto var_name = var_desc->Name(); + if (var_desc->Persistable() && scope.FindVar(var_name) && + std::count(scale_and_zero_point_param.begin(), + scale_and_zero_point_param.end(), + var_name) > 0) { + scope.EraseVars({var_name}); block->RemoveVar(var_desc->Name()); } } From 9c1ff4b922eb7096fed049d777374a8202c5cde7 Mon Sep 17 00:00:00 2001 From: cyber-pioneer <116002591+cyber-pioneer@users.noreply.github.com> Date: Fri, 1 Mar 2024 14:33:46 +0800 Subject: [PATCH 52/55] [Prim][PIR] Add simple llama config for llama eval test (#62208) * add llama config program txt * polish test case * polish code * fix code * fix file path * fix test case * fix test case --- test/ir/pir/cinn/symbolic/CMakeLists.txt | 13 + test/ir/pir/cinn/symbolic/simple_llama.config | 252 ++++++++++++++++++ .../pir/cinn/symbolic/test_simple_llama_dy.py | 217 +++++++++++++++ 3 files changed, 482 insertions(+) create mode 100644 test/ir/pir/cinn/symbolic/simple_llama.config create mode 100644 test/ir/pir/cinn/symbolic/test_simple_llama_dy.py diff --git a/test/ir/pir/cinn/symbolic/CMakeLists.txt b/test/ir/pir/cinn/symbolic/CMakeLists.txt index 9f26f4dd17269..9d2fc16e2c638 100644 --- a/test/ir/pir/cinn/symbolic/CMakeLists.txt +++ b/test/ir/pir/cinn/symbolic/CMakeLists.txt @@ -7,6 +7,7 @@ if(WITH_GPU) list( REMOVE_ITEM CINN_PIR_SYMBOLIC_TEST + test_simple_llama_dy.py test_cinn_reduce_symbolic_demo.py test_if_st.py test_if_dy.py @@ -71,6 +72,18 @@ if(WITH_GPU) WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) set_tests_properties(test_llama_if_dy PROPERTIES LABELS "RUN_TYPE=CINN") + add_test( + NAME test_simple_llama_dy + COMMAND + ${CMAKE_COMMAND} -E env + PYTHONPATH=${CMAKE_BINARY_DIR}:${CMAKE_BINARY_DIR}/python/:$ENV{PYTHONPATH} + FLAGS_prim_enable_dynamic=true FLAGS_prim_check_ops=true + FLAGS_enable_pir_api=true FLAGS_cinn_bucket_compile=false + FLAGS_pir_apply_shape_optimization_pass=false ${PYTHON_EXECUTABLE} + ${CMAKE_CURRENT_SOURCE_DIR}/test_simple_llama_dy.py + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + set_tests_properties(test_simple_llama_dy PROPERTIES LABELS "RUN_TYPE=CINN") + add_test( NAME test_decomp_inference_predictor_run COMMAND diff --git a/test/ir/pir/cinn/symbolic/simple_llama.config b/test/ir/pir/cinn/symbolic/simple_llama.config new file mode 100644 index 0000000000000..ef3193a8cc735 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/simple_llama.config @@ -0,0 +1,252 @@ +{ + (%0) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"embedding_0.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[32000,4096],stop_gradient:[true]} : () -> pd_op.tensor<32000x4096xf16> + (%1) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"create_parameter_0.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096],stop_gradient:[true]} : () -> pd_op.tensor<4096xf16> + (%2) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"linear_0.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,4096],stop_gradient:[true]} : () -> pd_op.tensor<4096x4096xf16> + (%3) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"linear_1.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,4096],stop_gradient:[true]} : () -> pd_op.tensor<4096x4096xf16> + (%4) = "pd_op.data" () 
{dtype:(pd_op.DataType)float16,name:"linear_2.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,4096],stop_gradient:[true]} : () -> pd_op.tensor<4096x4096xf16> + (%5) = "pd_op.data" () {dtype:(pd_op.DataType)float32,name:"eager_tmp_1",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[1,2048,1,128],stop_gradient:[true]} : () -> pd_op.tensor<1x2048x1x128xf32> + (%6) = "pd_op.data" () {dtype:(pd_op.DataType)float32,name:"eager_tmp_2",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[1,2048,1,128],stop_gradient:[true]} : () -> pd_op.tensor<1x2048x1x128xf32> + (%7) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"linear_3.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,4096],stop_gradient:[true]} : () -> pd_op.tensor<4096x4096xf16> + (%8) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"create_parameter_1.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096],stop_gradient:[true]} : () -> pd_op.tensor<4096xf16> + (%9) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"linear_4.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,11008],stop_gradient:[true]} : () -> pd_op.tensor<4096x11008xf16> + (%10) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"linear_5.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,11008],stop_gradient:[true]} : () -> pd_op.tensor<4096x11008xf16> + (%11) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"linear_6.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[11008,4096],stop_gradient:[true]} : () -> pd_op.tensor<11008x4096xf16> + (%12) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"create_parameter_2.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096],stop_gradient:[true]} : () -> pd_op.tensor<4096xf16> + (%13) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"llama_lm_head_0.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,32000],stop_gradient:[true]} : () -> pd_op.tensor<4096x32000xf16> + (%14) = "pd_op.data" () {dtype:(pd_op.DataType)float32,name:"top_p",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[1],stop_gradient:[true]} : () -> pd_op.tensor<1xf32> + (%15) = "pd_op.data" () {dtype:(pd_op.DataType)int64,name:"position_ids",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[-1,-1],stop_gradient:[true]} : () -> pd_op.tensor<-1x-1xi64> + (%16) = "pd_op.data" () {dtype:(pd_op.DataType)int64,name:"attention_mask",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[-1,-1],stop_gradient:[true]} : () -> pd_op.tensor<-1x-1xi64> + (%17) = "pd_op.data" () {dtype:(pd_op.DataType)int64,name:"input_ids",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[-1,-1],stop_gradient:[true]} : () -> pd_op.tensor<-1x-1xi64> + (%18) = "pd_op.shape" (%17) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1xi64>) -> pd_op.tensor<2xi32> + (%19) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0]} : () -> pd_op.tensor<1xi64> + (%20) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> + (%21) = "pd_op.slice" (%18, %19, %20) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<2xi32>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor + (%22) = 
"pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> + (%23) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> pd_op.tensor<1xi64> + (%24) = "pd_op.slice" (%18, %22, %23) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<2xi32>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor + (%25) = "pd_op.cast" (%24) {dtype:(pd_op.DataType)int64,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor) -> pd_op.tensor + (%26) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> + (%27) = "pd_op.full_with_tensor" (%26, %25) {dtype:(pd_op.DataType)int64,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1xi64>, pd_op.tensor) -> pd_op.tensor<1xi64> + (%28) = "pd_op.shape" (%17) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1xi64>) -> pd_op.tensor<2xi32> + (%29) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> + (%30) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> pd_op.tensor<1xi64> + (%31) = "pd_op.slice" (%28, %29, %30) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<2xi32>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor + (%32) = "pd_op.cast" (%31) {dtype:(pd_op.DataType)int64,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor) -> pd_op.tensor + (%33) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> + (%34) = "pd_op.full_with_tensor" (%33, %32) {dtype:(pd_op.DataType)int64,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1xi64>, pd_op.tensor) -> pd_op.tensor<1xi64> + (%35) = "pd_op.full" () {dtype:(pd_op.DataType)int32,is_persistable:[false],place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[],stop_gradient:[false],value:(Float)1} : () -> pd_op.tensor + (%36) = "builtin.combine" (%21, %35) {} : (pd_op.tensor, pd_op.tensor) -> vec[pd_op.tensor,pd_op.tensor] + (%37) = "pd_op.stack" (%36) {axis:(Int32)0,stop_gradient:[true]} : (vec[pd_op.tensor,pd_op.tensor]) -> pd_op.tensor<2xi32> + (%38) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> pd_op.tensor<1xf32> + (%39) = "pd_op.full_with_tensor" (%37, %38) {dtype:(pd_op.DataType)bool,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<2xi32>, pd_op.tensor<1xf32>) -> pd_op.tensor<-1x1xb> + (%40) = "pd_op.full" () {dtype:(pd_op.DataType)int32,is_persistable:[false],place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[],stop_gradient:[false],value:(Float)1} : () -> pd_op.tensor + (%41) = "builtin.combine" (%21, %40) {} : (pd_op.tensor, pd_op.tensor) -> vec[pd_op.tensor,pd_op.tensor] + (%42) = "pd_op.stack" (%41) {axis:(Int32)0,stop_gradient:[true]} : (vec[pd_op.tensor,pd_op.tensor]) -> pd_op.tensor<2xi32> + (%43) = "pd_op.full" () 
{dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)0} : () -> pd_op.tensor<1xf32> + (%44) = "pd_op.full_with_tensor" (%42, %43) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<2xi32>, pd_op.tensor<1xf32>) -> pd_op.tensor<-1x1xf16> + (%45) = "pd_op.shape" (%17) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1xi64>) -> pd_op.tensor<2xi32> + (%46) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> + (%47) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> pd_op.tensor<1xi64> + (%48) = "pd_op.slice" (%45, %46, %47) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<2xi32>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor + (%49) = "pd_op.embedding" (%17, %0) {is_persistable:[false],padding_idx:(Int64)-1,sparse:false,stop_gradient:[false]} : (pd_op.tensor<-1x-1xi64>, pd_op.tensor<32000x4096xf16>) -> pd_op.tensor<-1x-1x4096xf16> + (%50) = "pd_op.shape" (%16) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1xi64>) -> pd_op.tensor<2xi32> + (%51) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0]} : () -> pd_op.tensor<1xi64> + (%52) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> + (%53) = "pd_op.slice" (%50, %51, %52) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<2xi32>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor + (%54) = "pd_op.shape" (%16) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1xi64>) -> pd_op.tensor<2xi32> + (%55) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> + (%56) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> pd_op.tensor<1xi64> + (%57) = "pd_op.slice" (%54, %55, %56) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<2xi32>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor + (%58) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1,(Int64)2]} : () -> pd_op.tensor<2xi64> + (%59, %60) = "pd_op.unsqueeze" (%16, %58) {is_persistable:[false,false],stop_gradient:[false,false]} : (pd_op.tensor<-1x-1xi64>, pd_op.tensor<2xi64>) -> pd_op.tensor<-1x1x1x-1xi64>, <> + (%61) = "pd_op.cast" (%59) {dtype:(pd_op.DataType)bool,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1x1x-1xi64>) -> pd_op.tensor<-1x1x1x-1xb> + (%62) = "pd_op.full" () {dtype:(pd_op.DataType)int32,is_persistable:[false],place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[false],value:(Float)1} : () -> pd_op.tensor<1xi32> + (%63) = "builtin.combine" (%53, %62, %48, %57) {} : (pd_op.tensor, pd_op.tensor<1xi32>, pd_op.tensor, pd_op.tensor) -> vec[pd_op.tensor,pd_op.tensor<1xi32>,pd_op.tensor,pd_op.tensor] + (%64) = "pd_op.expand" (%61, %63) 
{is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1x1x-1xb>, vec[pd_op.tensor,pd_op.tensor<1xi32>,pd_op.tensor,pd_op.tensor]) -> pd_op.tensor<-1x1x-1x-1xb> + (%65) = "pd_op.full" () {dtype:(pd_op.DataType)float64,is_persistable:[false],place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[1],stop_gradient:[false],value:(Float)0} : () -> pd_op.tensor<1xf64> + (%66) = "pd_op.full" () {dtype:(pd_op.DataType)float64,is_persistable:[false],place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[1],stop_gradient:[false],value:(Float)-65504} : () -> pd_op.tensor<1xf64> + (%67) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)0} : () -> pd_op.tensor<1xf32> + (%68) = "pd_op.full_like" (%65, %67) {dtype:(pd_op.DataType)float64,is_persistable:[false],place:(pd_op.Place)Place(undefined:0),stop_gradient:[false]} : (pd_op.tensor<1xf64>, pd_op.tensor<1xf32>) -> pd_op.tensor<1xf64> + (%69) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)0} : () -> pd_op.tensor<1xf32> + (%70) = "pd_op.full_like" (%66, %69) {dtype:(pd_op.DataType)float64,is_persistable:[false],place:(pd_op.Place)Place(undefined:0),stop_gradient:[false]} : (pd_op.tensor<1xf64>, pd_op.tensor<1xf32>) -> pd_op.tensor<1xf64> + (%71) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)0} : () -> pd_op.tensor<1xf32> + (%72) = "pd_op.full_like" (%64, %71) {dtype:(pd_op.DataType)bool,is_persistable:[false],place:(pd_op.Place)Place(undefined:0),stop_gradient:[false]} : (pd_op.tensor<-1x1x-1x-1xb>, pd_op.tensor<1xf32>) -> pd_op.tensor<-1x1x-1x-1xb> + (%73) = "pd_op.cast" (%72) {dtype:(pd_op.DataType)float64,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1x-1x-1xb>) -> pd_op.tensor<-1x1x-1x-1xf64> + (%74) = "pd_op.cast" (%64) {dtype:(pd_op.DataType)float64,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1x-1x-1xb>) -> pd_op.tensor<-1x1x-1x-1xf64> + (%75) = "pd_op.add" (%68, %70) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1xf64>, pd_op.tensor<1xf64>) -> pd_op.tensor<1xf64> + (%76) = "pd_op.add" (%75, %73) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1xf64>, pd_op.tensor<-1x1x-1x-1xf64>) -> pd_op.tensor<-1x1x-1x-1xf64> + (%77) = "pd_op.add" (%65, %76) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1xf64>, pd_op.tensor<-1x1x-1x-1xf64>) -> pd_op.tensor<-1x1x-1x-1xf64> + (%78) = "pd_op.add" (%66, %76) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1xf64>, pd_op.tensor<-1x1x-1x-1xf64>) -> pd_op.tensor<-1x1x-1x-1xf64> + (%79) = "pd_op.add" (%74, %76) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1x-1x-1xf64>, pd_op.tensor<-1x1x-1x-1xf64>) -> pd_op.tensor<-1x1x-1x-1xf64> + (%80) = "pd_op.cast" (%79) {dtype:(pd_op.DataType)bool,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1x-1x-1xf64>) -> pd_op.tensor<-1x1x-1x-1xb> + (%81) = "pd_op.where" (%80, %77, %78) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1x-1x-1xb>, pd_op.tensor<-1x1x-1x-1xf64>, pd_op.tensor<-1x1x-1x-1xf64>) -> pd_op.tensor<-1x1x-1x-1xf64> + (%82) = "pd_op.cast" (%81) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1x-1x-1xf64>) -> pd_op.tensor<-1x1x-1x-1xf16> + (%83) = "pd_op.cast" (%49) 
{dtype:(pd_op.DataType)float32,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf16>) -> pd_op.tensor<-1x-1x4096xf32> + (%84) = "pd_op.pow" (%83) {is_persistable:[false],stop_gradient:[false],y:(Float)2} : (pd_op.tensor<-1x-1x4096xf32>) -> pd_op.tensor<-1x-1x4096xf32> + (%85) = "pd_op.mean" (%84) {axis:(pd_op.IntArray)[-1],is_persistable:[false],keepdim:true,stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf32>) -> pd_op.tensor<-1x-1x1xf32> + (%86) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> pd_op.tensor<1xf32> + (%87) = "pd_op.scale" (%85, %86) {bias:(Float)1e-06,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x1xf32>, pd_op.tensor<1xf32>) -> pd_op.tensor<-1x-1x1xf32> + (%88) = "pd_op.rsqrt" (%87) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x1xf32>) -> pd_op.tensor<-1x-1x1xf32> + (%89) = "pd_op.multiply" (%88, %83) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x1xf32>, pd_op.tensor<-1x-1x4096xf32>) -> pd_op.tensor<-1x-1x4096xf32> + (%90) = "pd_op.cast" (%89) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf32>) -> pd_op.tensor<-1x-1x4096xf16> + (%91) = "pd_op.multiply" (%90, %1) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4096xf16>) -> pd_op.tensor<-1x-1x4096xf16> + (%92) = "pd_op.matmul" (%91, %2) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4096x4096xf16>) -> pd_op.tensor<-1x-1x4096xf16> + (%93) = "pd_op.matmul" (%91, %3) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4096x4096xf16>) -> pd_op.tensor<-1x-1x4096xf16> + (%94) = "pd_op.matmul" (%91, %4) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4096x4096xf16>) -> pd_op.tensor<-1x-1x4096xf16> + (%95) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0,(Int64)0,(Int64)32,(Int64)128]} : () -> pd_op.tensor<4xi64> + (%96, %97) = "pd_op.reshape" (%92, %95) {is_persistable:[false,false],stop_gradient:[false,false]} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4xi64>) -> pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<0x-1x-1x4096xf16> + (%98) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0,(Int64)0,(Int64)32,(Int64)128]} : () -> pd_op.tensor<4xi64> + (%99, %100) = "pd_op.reshape" (%93, %98) {is_persistable:[false,false],stop_gradient:[false,false]} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4xi64>) -> pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<0x-1x-1x4096xf16> + (%101) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0,(Int64)0,(Int64)32,(Int64)128]} : () -> pd_op.tensor<4xi64> + (%102, %103) = "pd_op.reshape" (%94, %101) {is_persistable:[false,false],stop_gradient:[false,false]} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4xi64>) -> pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<0x-1x-1x4096xf16> + (%104) = "pd_op.shape" (%99) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>) -> pd_op.tensor<4xi32> + (%105) = 
"pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> + (%106) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> pd_op.tensor<1xi64> + (%107) = "pd_op.slice" (%104, %105, %106) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<4xi32>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor + (%108) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0]} : () -> pd_op.tensor<1xi64> + (%109) = "builtin.combine" (%107) {} : (pd_op.tensor) -> vec[pd_op.tensor] + (%110) = "pd_op.slice" (%5, %108, %109) {axes:[(Int64)1],decrease_axis:[],infer_flags:[(Int64)-1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1x2048x1x128xf32>, pd_op.tensor<1xi64>, vec[pd_op.tensor]) -> pd_op.tensor<1x-1x1x128xf32> + (%111) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0]} : () -> pd_op.tensor<1xi64> + (%112) = "builtin.combine" (%107) {} : (pd_op.tensor) -> vec[pd_op.tensor] + (%113) = "pd_op.slice" (%6, %111, %112) {axes:[(Int64)1],decrease_axis:[],infer_flags:[(Int64)-1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1x2048x1x128xf32>, pd_op.tensor<1xi64>, vec[pd_op.tensor]) -> pd_op.tensor<1x-1x1x128xf32> + (%114) = "pd_op.cast" (%110) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1x-1x1x128xf32>) -> pd_op.tensor<1x-1x1x128xf16> + (%115) = "pd_op.cast" (%113) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1x-1x1x128xf32>) -> pd_op.tensor<1x-1x1x128xf16> + (%116) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0,(Int64)2]} : () -> pd_op.tensor<2xi64> + (%117, %118) = "pd_op.squeeze" (%114, %116) {is_persistable:[false,false],stop_gradient:[false,false]} : (pd_op.tensor<1x-1x1x128xf16>, pd_op.tensor<2xi64>) -> pd_op.tensor<-1x128xf16>, <> + (%119) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0,(Int64)2]} : () -> pd_op.tensor<2xi64> + (%120, %121) = "pd_op.squeeze" (%115, %119) {is_persistable:[false,false],stop_gradient:[false,false]} : (pd_op.tensor<1x-1x1x128xf16>, pd_op.tensor<2xi64>) -> pd_op.tensor<-1x128xf16>, <> + (%122) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)-1]} : () -> pd_op.tensor<1xi64> + (%123, %124) = "pd_op.unsqueeze" (%15, %122) {is_persistable:[false,false],stop_gradient:[false,false]} : (pd_op.tensor<-1x-1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor<-1x-1x1xi64>, <> + (%125) = "pd_op.gather_nd" (%117, %123) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x128xf16>, pd_op.tensor<-1x-1x1xi64>) -> pd_op.tensor<-1x-1x128xf16> + (%126) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> pd_op.tensor<1xi64> + (%127, %128) = "pd_op.unsqueeze" (%125, %126) {is_persistable:[false,false],stop_gradient:[false,false]} : (pd_op.tensor<-1x-1x128xf16>, pd_op.tensor<1xi64>) -> pd_op.tensor<-1x-1x1x128xf16>, <> + (%129) = "pd_op.full_int_array" () 
{dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)-1]} : () -> pd_op.tensor<1xi64> + (%130, %131) = "pd_op.unsqueeze" (%15, %129) {is_persistable:[false,false],stop_gradient:[false,false]} : (pd_op.tensor<-1x-1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor<-1x-1x1xi64>, <> + (%132) = "pd_op.gather_nd" (%120, %130) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x128xf16>, pd_op.tensor<-1x-1x1xi64>) -> pd_op.tensor<-1x-1x128xf16> + (%133) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> pd_op.tensor<1xi64> + (%134, %135) = "pd_op.unsqueeze" (%132, %133) {is_persistable:[false,false],stop_gradient:[false,false]} : (pd_op.tensor<-1x-1x128xf16>, pd_op.tensor<1xi64>) -> pd_op.tensor<-1x-1x1x128xf16>, <> + (%136) = "pd_op.multiply" (%96, %127) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<-1x-1x1x128xf16>) -> pd_op.tensor<-1x-1x32x128xf16> + (%137) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0]} : () -> pd_op.tensor<1xi64> + (%138) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)64]} : () -> pd_op.tensor<1xi64> + (%139) = "pd_op.slice" (%96, %137, %138) {axes:[(Int64)3],decrease_axis:[],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor<-1x-1x32x64xf16> + (%140) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)64]} : () -> pd_op.tensor<1xi64> + (%141) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2147483647]} : () -> pd_op.tensor<1xi64> + (%142) = "pd_op.slice" (%96, %140, %141) {axes:[(Int64)3],decrease_axis:[],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor<-1x-1x32x64xf16> + (%143) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)-1} : () -> pd_op.tensor<1xf32> + (%144) = "pd_op.scale" (%142, %143) {bias:(Float)0,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x64xf16>, pd_op.tensor<1xf32>) -> pd_op.tensor<-1x-1x32x64xf16> + (%145) = "builtin.combine" (%144, %139) {} : (pd_op.tensor<-1x-1x32x64xf16>, pd_op.tensor<-1x-1x32x64xf16>) -> vec[pd_op.tensor<-1x-1x32x64xf16>,pd_op.tensor<-1x-1x32x64xf16>] + (%146) = "pd_op.full" () {dtype:(pd_op.DataType)int32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)-1} : () -> pd_op.tensor<1xi32> + (%147) = "pd_op.concat" (%145, %146) {is_persistable:[false],stop_gradient:[false]} : (vec[pd_op.tensor<-1x-1x32x64xf16>,pd_op.tensor<-1x-1x32x64xf16>], pd_op.tensor<1xi32>) -> pd_op.tensor<-1x-1x32x128xf16> + (%148) = "pd_op.multiply" (%147, %134) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<-1x-1x1x128xf16>) -> pd_op.tensor<-1x-1x32x128xf16> + (%149) = "pd_op.add" (%136, %148) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<-1x-1x32x128xf16>) -> pd_op.tensor<-1x-1x32x128xf16> + (%150) = 
"pd_op.multiply" (%99, %127) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<-1x-1x1x128xf16>) -> pd_op.tensor<-1x-1x32x128xf16> + (%151) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0]} : () -> pd_op.tensor<1xi64> + (%152) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)64]} : () -> pd_op.tensor<1xi64> + (%153) = "pd_op.slice" (%99, %151, %152) {axes:[(Int64)3],decrease_axis:[],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor<-1x-1x32x64xf16> + (%154) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)64]} : () -> pd_op.tensor<1xi64> + (%155) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2147483647]} : () -> pd_op.tensor<1xi64> + (%156) = "pd_op.slice" (%99, %154, %155) {axes:[(Int64)3],decrease_axis:[],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor<-1x-1x32x64xf16> + (%157) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)-1} : () -> pd_op.tensor<1xf32> + (%158) = "pd_op.scale" (%156, %157) {bias:(Float)0,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x64xf16>, pd_op.tensor<1xf32>) -> pd_op.tensor<-1x-1x32x64xf16> + (%159) = "builtin.combine" (%158, %153) {} : (pd_op.tensor<-1x-1x32x64xf16>, pd_op.tensor<-1x-1x32x64xf16>) -> vec[pd_op.tensor<-1x-1x32x64xf16>,pd_op.tensor<-1x-1x32x64xf16>] + (%160) = "pd_op.full" () {dtype:(pd_op.DataType)int32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)-1} : () -> pd_op.tensor<1xi32> + (%161) = "pd_op.concat" (%159, %160) {is_persistable:[false],stop_gradient:[false]} : (vec[pd_op.tensor<-1x-1x32x64xf16>,pd_op.tensor<-1x-1x32x64xf16>], pd_op.tensor<1xi32>) -> pd_op.tensor<-1x-1x32x128xf16> + (%162) = "pd_op.multiply" (%161, %134) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<-1x-1x1x128xf16>) -> pd_op.tensor<-1x-1x32x128xf16> + (%163) = "pd_op.add" (%150, %162) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<-1x-1x32x128xf16>) -> pd_op.tensor<-1x-1x32x128xf16> + (%164) = "pd_op.shape" (%149) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>) -> pd_op.tensor<4xi32> + (%165) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0]} : () -> pd_op.tensor<1xi64> + (%166) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> + (%167) = "pd_op.slice" (%164, %165, %166) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<4xi32>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor + (%168) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> + (%169) = 
"pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> pd_op.tensor<1xi64> + (%170) = "pd_op.slice" (%164, %168, %169) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<4xi32>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor + (%171) = "pd_op.shape" (%102) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>) -> pd_op.tensor<4xi32> + (%172) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> + (%173) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> pd_op.tensor<1xi64> + (%174) = "pd_op.slice" (%171, %172, %173) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<4xi32>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor + (%175) = "pd_op.transpose" (%149) {is_persistable:[false],perm:[(Int32)0,(Int32)2,(Int32)1,(Int32)3],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>) -> pd_op.tensor<-1x32x-1x128xf16> + (%176) = "pd_op.transpose" (%163) {is_persistable:[false],perm:[(Int32)0,(Int32)2,(Int32)1,(Int32)3],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>) -> pd_op.tensor<-1x32x-1x128xf16> + (%177) = "pd_op.transpose" (%102) {is_persistable:[false],perm:[(Int32)0,(Int32)2,(Int32)1,(Int32)3],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>) -> pd_op.tensor<-1x32x-1x128xf16> + (%178) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)0.0883883} : () -> pd_op.tensor<1xf32> + (%179) = "pd_op.scale" (%175, %178) {bias:(Float)0,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x32x-1x128xf16>, pd_op.tensor<1xf32>) -> pd_op.tensor<-1x32x-1x128xf16> + (%180) = "pd_op.transpose" (%176) {is_persistable:[false],perm:[(Int32)0,(Int32)1,(Int32)3,(Int32)2],stop_gradient:[false]} : (pd_op.tensor<-1x32x-1x128xf16>) -> pd_op.tensor<-1x32x128x-1xf16> + (%181) = "pd_op.matmul" (%179, %180) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_op.tensor<-1x32x-1x128xf16>, pd_op.tensor<-1x32x128x-1xf16>) -> pd_op.tensor<-1x32x-1x-1xf16> + (%182) = "pd_op.full" () {dtype:(pd_op.DataType)int32,is_persistable:[false],place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[false],value:(Float)1} : () -> pd_op.tensor<1xi32> + (%183) = "builtin.combine" (%167, %182, %170, %174) {} : (pd_op.tensor, pd_op.tensor<1xi32>, pd_op.tensor, pd_op.tensor) -> vec[pd_op.tensor,pd_op.tensor<1xi32>,pd_op.tensor,pd_op.tensor] + (%184, %185) = "pd_op.reshape" (%82, %183) {is_persistable:[false,false],stop_gradient:[false,false]} : (pd_op.tensor<-1x1x-1x-1xf16>, vec[pd_op.tensor,pd_op.tensor<1xi32>,pd_op.tensor,pd_op.tensor]) -> pd_op.tensor<-1x1x-1x-1xf16>, pd_op.tensor<0x-1x1x-1x-1xf16> + (%186) = "pd_op.add" (%181, %184) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x32x-1x-1xf16>, pd_op.tensor<-1x1x-1x-1xf16>) -> pd_op.tensor<-1x32x-1x-1xf16> + (%187) = "pd_op.cast" (%186) {dtype:(pd_op.DataType)float32,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x32x-1x-1xf16>) -> pd_op.tensor<-1x32x-1x-1xf32> + (%188) = "pd_op.softmax" (%187) 
{axis:(Int32)-1,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x32x-1x-1xf32>) -> pd_op.tensor<-1x32x-1x-1xf32> + (%189) = "pd_op.cast" (%188) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x32x-1x-1xf32>) -> pd_op.tensor<-1x32x-1x-1xf16> + (%190) = "pd_op.matmul" (%189, %177) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_op.tensor<-1x32x-1x-1xf16>, pd_op.tensor<-1x32x-1x128xf16>) -> pd_op.tensor<-1x32x-1x128xf16> + (%191) = "pd_op.transpose" (%190) {is_persistable:[false],perm:[(Int32)0,(Int32)2,(Int32)1,(Int32)3],stop_gradient:[false]} : (pd_op.tensor<-1x32x-1x128xf16>) -> pd_op.tensor<-1x-1x32x128xf16> + (%192) = "pd_op.full" () {dtype:(pd_op.DataType)int32,is_persistable:[false],place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[false],value:(Float)4096} : () -> pd_op.tensor<1xi32> + (%193) = "builtin.combine" (%167, %170, %192) {} : (pd_op.tensor, pd_op.tensor, pd_op.tensor<1xi32>) -> vec[pd_op.tensor,pd_op.tensor,pd_op.tensor<1xi32>] + (%194, %195) = "pd_op.reshape" (%191, %193) {is_persistable:[false,false],stop_gradient:[false,false]} : (pd_op.tensor<-1x-1x32x128xf16>, vec[pd_op.tensor,pd_op.tensor,pd_op.tensor<1xi32>]) -> pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<0x-1x-1x32x128xf16> + (%196) = "pd_op.matmul" (%194, %7) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4096x4096xf16>) -> pd_op.tensor<-1x-1x4096xf16> + (%197) = "pd_op.add" (%49, %196) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<-1x-1x4096xf16>) -> pd_op.tensor<-1x-1x4096xf16> + (%198) = "pd_op.cast" (%197) {dtype:(pd_op.DataType)float32,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf16>) -> pd_op.tensor<-1x-1x4096xf32> + (%199) = "pd_op.pow" (%198) {is_persistable:[false],stop_gradient:[false],y:(Float)2} : (pd_op.tensor<-1x-1x4096xf32>) -> pd_op.tensor<-1x-1x4096xf32> + (%200) = "pd_op.mean" (%199) {axis:(pd_op.IntArray)[-1],is_persistable:[false],keepdim:true,stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf32>) -> pd_op.tensor<-1x-1x1xf32> + (%201) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> pd_op.tensor<1xf32> + (%202) = "pd_op.scale" (%200, %201) {bias:(Float)1e-06,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x1xf32>, pd_op.tensor<1xf32>) -> pd_op.tensor<-1x-1x1xf32> + (%203) = "pd_op.rsqrt" (%202) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x1xf32>) -> pd_op.tensor<-1x-1x1xf32> + (%204) = "pd_op.multiply" (%203, %198) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x1xf32>, pd_op.tensor<-1x-1x4096xf32>) -> pd_op.tensor<-1x-1x4096xf32> + (%205) = "pd_op.cast" (%204) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf32>) -> pd_op.tensor<-1x-1x4096xf16> + (%206) = "pd_op.multiply" (%205, %8) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4096xf16>) -> pd_op.tensor<-1x-1x4096xf16> + (%207) = "pd_op.matmul" (%206, %9) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4096x11008xf16>) -> pd_op.tensor<-1x-1x11008xf16> + (%208) = "pd_op.matmul" (%206, %10) 
{is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4096x11008xf16>) -> pd_op.tensor<-1x-1x11008xf16> + (%209) = "pd_op.swiglu" (%207, %208) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x11008xf16>, pd_op.tensor<-1x-1x11008xf16>) -> pd_op.tensor<-1x-1x11008xf16> + (%210) = "pd_op.matmul" (%209, %11) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_op.tensor<-1x-1x11008xf16>, pd_op.tensor<11008x4096xf16>) -> pd_op.tensor<-1x-1x4096xf16> + (%211) = "pd_op.add" (%197, %210) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<-1x-1x4096xf16>) -> pd_op.tensor<-1x-1x4096xf16> + (%212) = "pd_op.cast" (%211) {dtype:(pd_op.DataType)float32,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf16>) -> pd_op.tensor<-1x-1x4096xf32> + (%213) = "pd_op.pow" (%212) {is_persistable:[false],stop_gradient:[false],y:(Float)2} : (pd_op.tensor<-1x-1x4096xf32>) -> pd_op.tensor<-1x-1x4096xf32> + (%214) = "pd_op.mean" (%213) {axis:(pd_op.IntArray)[-1],is_persistable:[false],keepdim:true,stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf32>) -> pd_op.tensor<-1x-1x1xf32> + (%215) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> pd_op.tensor<1xf32> + (%216) = "pd_op.scale" (%214, %215) {bias:(Float)1e-06,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x1xf32>, pd_op.tensor<1xf32>) -> pd_op.tensor<-1x-1x1xf32> + (%217) = "pd_op.rsqrt" (%216) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x1xf32>) -> pd_op.tensor<-1x-1x1xf32> + (%218) = "pd_op.multiply" (%217, %212) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x1xf32>, pd_op.tensor<-1x-1x4096xf32>) -> pd_op.tensor<-1x-1x4096xf32> + (%219) = "pd_op.cast" (%218) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf32>) -> pd_op.tensor<-1x-1x4096xf16> + (%220) = "pd_op.multiply" (%219, %12) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4096xf16>) -> pd_op.tensor<-1x-1x4096xf16> + (%221) = "pd_op.matmul" (%220, %13) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4096x32000xf16>) -> pd_op.tensor<-1x-1x32000xf16> + (%222) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)-1]} : () -> pd_op.tensor<1xi64> + (%223) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2147483647]} : () -> pd_op.tensor<1xi64> + (%224) = "pd_op.slice" (%221, %222, %223) {axes:[(Int64)1],decrease_axis:[(Int64)1],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32000xf16>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor<-1x32000xf16> + (%225) = "pd_op.softmax" (%224) {axis:(Int32)-1,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x32000xf16>) -> pd_op.tensor<-1x32000xf16> + (%226) = "pd_op.log_softmax" (%224) {axis:(Int32)-1,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x32000xf16>) -> pd_op.tensor<-1x32000xf16> + (%227) = "pd_op.shape" (%225) {is_persistable:[false],stop_gradient:[false]} : 
(pd_op.tensor<-1x32000xf16>) -> pd_op.tensor<2xi32> + (%228) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0]} : () -> pd_op.tensor<1xi64> + (%229) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> + (%230) = "pd_op.slice" (%227, %228, %229) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<2xi32>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor + (%231) = "pd_op.cast" (%14) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1xf32>) -> pd_op.tensor<1xf16> + (%232) = "pd_op.full" () {dtype:(pd_op.DataType)int32,is_persistable:[false],place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[],stop_gradient:[false],value:(Float)1} : () -> pd_op.tensor + (%233) = "builtin.combine" (%230, %232) {} : (pd_op.tensor, pd_op.tensor) -> vec[pd_op.tensor,pd_op.tensor] + (%234) = "pd_op.stack" (%233) {axis:(Int32)0,stop_gradient:[true]} : (vec[pd_op.tensor,pd_op.tensor]) -> pd_op.tensor<2xi32> + (%235) = "pd_op.full_with_tensor" (%234, %231) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<2xi32>, pd_op.tensor<1xf16>) -> pd_op.tensor<-1x1xf16> + (%236, %237) = "pd_op.top_p_sampling" (%225, %235, <>) {is_persistable:[false,false],seed:(Int32)-1,stop_gradient:[false,false]} : (pd_op.tensor<-1x32000xf16>, pd_op.tensor<-1x1xf16>, <>) -> pd_op.tensor<-1x1xf16>, pd_op.tensor<-1x1xi64> + (%238) = "pd_op.index_sample" (%226, %237) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x32000xf16>, pd_op.tensor<-1x1xi64>) -> pd_op.tensor<-1x1xf16> + (%239) = "pd_op.subtract" (%27, %34) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor<1xi64> + (%240) = "pd_op.cast" (%239) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1xi64>) -> pd_op.tensor<1xf16> + (%241) = "pd_op.multiply" (%44, %240) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1xf16>, pd_op.tensor<1xf16>) -> pd_op.tensor<-1x1xf16> + (%242) = "pd_op.add" (%241, %238) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1xf16>, pd_op.tensor<-1x1xf16>) -> pd_op.tensor<-1x1xf16> + (%243) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> pd_op.tensor<1xf32> + (%244) = "pd_op.scale" (%239, %243) {bias:(Float)1,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1xi64>, pd_op.tensor<1xf32>) -> pd_op.tensor<1xi64> + (%245) = "pd_op.cast" (%244) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1xi64>) -> pd_op.tensor<1xf16> + (%246) = "pd_op.divide" (%242, %245) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1xf16>, pd_op.tensor<1xf16>) -> pd_op.tensor<-1x1xf16> + (%247) = "pd_op.where" (%39, %246, %44) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1xb>, pd_op.tensor<-1x1xf16>, pd_op.tensor<-1x1xf16>) -> pd_op.tensor<-1x1xf16> + (%248) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)0} : () -> pd_op.tensor<1xf32> + (%249) = "pd_op.full_like" (%237, %248) 
{dtype:(pd_op.DataType)int64,is_persistable:[false],place:(pd_op.Place)Place(undefined:0),stop_gradient:[false]} : (pd_op.tensor<-1x1xi64>, pd_op.tensor<1xf32>) -> pd_op.tensor<-1x1xi64>
+ (%250) = "pd_op.where" (%39, %237, %249) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1xb>, pd_op.tensor<-1x1xi64>, pd_op.tensor<-1x1xi64>) -> pd_op.tensor<-1x1xi64>
+ (%251) = "builtin.combine" (%17, %250) {} : (pd_op.tensor<-1x-1xi64>, pd_op.tensor<-1x1xi64>) -> vec[pd_op.tensor<-1x-1xi64>,pd_op.tensor<-1x1xi64>]
+ (%252) = "pd_op.full" () {dtype:(pd_op.DataType)int32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> pd_op.tensor<1xi32>
+ (%253) = "pd_op.concat" (%251, %252) {is_persistable:[false],stop_gradient:[false]} : (vec[pd_op.tensor<-1x-1xi64>,pd_op.tensor<-1x1xi64>], pd_op.tensor<1xi32>) -> pd_op.tensor<-1x-1xi64>
+ (%254) = "builtin.combine" (%31) {} : (pd_op.tensor) -> vec[pd_op.tensor]
+ (%255) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2147483647]} : () -> pd_op.tensor<1xi64>
+ (%256) = "pd_op.slice" (%253, %254, %255) {axes:[(Int64)1],decrease_axis:[],infer_flags:[(Int64)-1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1xi64>, vec[pd_op.tensor], pd_op.tensor<1xi64>) -> pd_op.tensor<-1x-1xi64>
+ (%257) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> pd_op.tensor<1xf32>
+ (%258) = "pd_op.scale" (%256, %257) {bias:(Float)0,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1xi64>, pd_op.tensor<1xf32>) -> pd_op.tensor<-1x-1xi64>
+ (%259) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> pd_op.tensor<1xf32>
+ (%260) = "pd_op.scale" (%247, %259) {bias:(Float)0,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1xf16>, pd_op.tensor<1xf32>) -> pd_op.tensor<-1x1xf16>
+ (%261) = "pd_op.fetch" (%258) {col:(Int32)0,name:"save_infer_model/scale_0.tmp_0"} : (pd_op.tensor<-1x-1xi64>) -> pd_op.tensor<-1x-1xi64>
+ (%262) = "pd_op.fetch" (%260) {col:(Int32)1,name:"save_infer_model/scale_1.tmp_0"} : (pd_op.tensor<-1x1xf16>) -> pd_op.tensor<-1x1xf16>
+}
diff --git a/test/ir/pir/cinn/symbolic/test_simple_llama_dy.py b/test/ir/pir/cinn/symbolic/test_simple_llama_dy.py
new file mode 100644
index 0000000000000..b23818368f30b
--- /dev/null
+++ b/test/ir/pir/cinn/symbolic/test_simple_llama_dy.py
@@ -0,0 +1,217 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
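The test added below consumes the textual PIR program dumped above. As a minimal sketch of that round trip (assuming the IR text is saved next to the test as simple_llama.config, the default --file_path used below), the serialized form can be parsed back and inspected with the same APIs the Parser class relies on:

    import paddle

    paddle.enable_static()
    # Parse the textual PIR form back into a Program object.
    with open("simple_llama.config", "r") as f:
        program = paddle.pir.parse_program(f.read())

    # Walk the global block, as Parser.run() does, and print each feed
    # op ("pd_op.data") together with its (possibly dynamic, -1) shape.
    for op in program.global_block().ops:
        if op.name() == "pd_op.data":
            print(op.attrs()['name'], op.result(0).shape)

Dynamic dimensions show up as -1 in the parsed shapes, which is why parse_feeds() below replaces them with 1 (and records have_dy_shape) before building random feed data.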
+
+import argparse
+import os
+import sys
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle.base import core
+from paddle.base.data_feeder import convert_dtype
+
+np.random.seed(2024)
+
+
+class ProgramInfo:
+    def __init__(self, program, feeds, fetchs):
+        self.program = program
+        # {name: [shape, dtype]}
+        self.feeds = feeds
+        # {name: shape}
+        self.fetchs = fetchs
+
+    def random_feeds(self):
+        feed_dict = {}
+        for name, info in self.feeds.items():
+            data = np.random.uniform(low=-0.5, high=0.5, size=info[0]).astype(
+                convert_dtype(info[1])
+            )
+            feed_dict[name] = data
+
+        return feed_dict
+
+    def fetch_list(self):
+        return list(self.fetchs.keys())
+
+
+class Parser:
+    def __init__(self):
+        self.feed_op_name = 'pd_op.data'
+        self.fetch_op_name = 'pd_op.fetch'
+        self.have_dy_shape = False
+
+    def run(self, file):
+        program = self.load_from(file)
+        for op in program.global_block().ops:
+            if op.name() == "pd_op.reshape":
+                if (
+                    op.result(1).initialized()
+                    and not op.result(1).use_empty()
+                    and op.result(1).first_use().owner().name() == "pd_op.fetch"
+                ):
+                    program.global_block().remove_op(
+                        op.result(1).first_use().owner()
+                    )
+
+            if op.name() == "pd_op.squeeze":
+                if (
+                    op.result(1).initialized()
+                    and not op.result(1).use_empty()
+                    and op.result(1).first_use().owner().name() == "pd_op.fetch"
+                ):
+                    program.global_block().remove_op(
+                        op.result(1).first_use().owner()
+                    )
+
+            if op.name() == "pd_op.unsqueeze":
+                if (
+                    op.result(1).initialized()
+                    and not op.result(1).use_empty()
+                    and op.result(1).first_use().owner().name() == "pd_op.fetch"
+                ):
+                    program.global_block().remove_op(
+                        op.result(1).first_use().owner()
+                    )
+
+            if (
+                op.name() == "pd_op.batch_norm_"
+                or op.name() == "pd_op.batch_norm"
+            ):
+                if (
+                    op.result(5).initialized()
+                    and not op.result(5).use_empty()
+                    and op.result(5).first_use().owner().name() == "pd_op.fetch"
+                ):
+                    program.global_block().remove_op(
+                        op.result(5).first_use().owner()
+                    )
+
+        feeds = self.parse_feeds(program)
+        fetchs = self.parse_fetchs(program)
+
+        return ProgramInfo(program, feeds, fetchs)
+
+    def load_from(self, file):
+        with open(file, 'r') as f:
+            content = f.read()
+
+        return paddle.pir.parse_program(content)
+
+    def parse_feeds(self, program):
+        feeds = {}
+        for op in program.global_block().ops:
+            if op.name() == self.feed_op_name:
+                in_val = op.result(0)
+                # shape, dtype
+                shape = []
+                for s in in_val.shape:
+                    if s == -1:
+                        s = 1
+                        self.have_dy_shape = True
+                    shape.append(s)
+                info = [shape, in_val.dtype]
+                feeds[op.attrs()['name']] = info
+
+        return feeds
+
+    def parse_fetchs(self, program):
+        fetchs = {}
+        for op in program.global_block().ops:
+            if op.name() == self.fetch_op_name:
+                in_val = op.operand_source(0)
+                fetchs[op.attrs()['name']] = in_val.shape
+
+        return fetchs
+
+
+class TestTask(unittest.TestCase):
+    def setUp(self):
+        paddle.enable_static()
+        file_dir = os.path.dirname(os.path.abspath(__file__))
+        self.file_path = os.path.join(file_dir, args.file_path)
+
+    def test_phi(self):
+        self.check_infer(enable_cinn=False)
+
+    def test_llama_eval(self):
+        parser = Parser()
+        program_info = parser.run(self.file_path)
+
+        feed = program_info.random_feeds()
+        fetch_list = program_info.fetch_list()
+
+        base_out = self.run_program(program_info.program, feed, fetch_list)
+
+        cinn_out = self.run_program(
+            program_info.program,
+            feed,
+            fetch_list,
+            enable_cinn=False,
+            prim_mode=True,
+        )
+
+        for cinn_res, base_res in zip(cinn_out, base_out):
+            np.testing.assert_allclose(cinn_res, base_res, atol=5e-3, rtol=5e-3)
+
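test_llama_eval above checks the second run against the plain run at atol=rtol=5e-3, a loose tolerance that suits the fp16 weights in this program. Note that despite its name, cinn_out is produced with enable_cinn=False and prim_mode=True, so only the prim-decomposition path of run_program (defined just below) is exercised. That path scopes the prim flag around the decompose call; a minimal sketch of the idiom, using the same calls run_program makes:

    import paddle
    from paddle.base import core

    def decompose_program(program):
        # Turn prim forward decomposition on only for the duration of
        # decompose(), then switch it back off, mirroring the
        # enable/decompose/disable sequence in run_program below.
        core._set_prim_forward_enabled(True)
        paddle.decomposition.decomp.decompose(program, [])
        core._set_prim_forward_enabled(False)
        return program

The CINN branch (enable_cinn=True) additionally builds a pir PassManager and applies add_cinn_pass to the program before handing it to the executor.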
+ def check_infer(self, enable_cinn): + parser = Parser() + program_info = parser.run(self.file_path) + if not parser.have_dy_shape: + feed = program_info.random_feeds() + fetch_list = program_info.fetch_list() + + return self.run_program( + program_info.program, feed, fetch_list, enable_cinn + ) + + def run_program( + self, program, feed, fetch_list, enable_cinn=False, prim_mode=False + ): + if prim_mode: + core._set_prim_forward_enabled(True) + paddle.decomposition.decomp.decompose(program, []) + core._set_prim_forward_enabled(False) + if enable_cinn: + fwd_pm = paddle.base.libpaddle.pir.PassManager() + paddle.base.libpaddle.pir.add_cinn_pass(fwd_pm, program) + fwd_pm.run(program) + + exe = paddle.static.Executor(paddle.CUDAPlace(0)) + outs = exe._run_pir_impl( + program, + feed=feed, + fetch_list=fetch_list, + feed_var_name="feed", + fetch_var_name='fetch', + scope=None, + return_numpy=True, + ) + return outs + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--file_path', + default="simple_llama.config", + help='input file', + dest='file_path', + ) + parser.add_argument('unittest_args', nargs='*') + args = parser.parse_args() + sys.argv[1:] = args.unittest_args + unittest.main() From 5859683678591106b3df649950993a59bbcf575b Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Fri, 1 Mar 2024 14:34:12 +0800 Subject: [PATCH 53/55] pir onednn elemetwise datalayout trans (#62265) --- .../instruction/onednn/onednn_instruction.cc | 68 +++++++++++-------- .../instruction/onednn/onednn_instruction.h | 2 + 2 files changed, 43 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc index aa3df67535747..923d745b49d68 100644 --- a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc @@ -245,16 +245,16 @@ OneDNNPhiKernelInstruction::OneDNNPhiKernelInstruction( } VLOG(6) << "finish process infer meta context"; - auto kernel_name = + auto kernel_name_ = op_attributes.at("kernel_name").dyn_cast().AsString(); - auto kernel_key = op_attributes.at("kernel_key") - .dyn_cast() - .data(); + auto kernel_key_ = op_attributes.at("kernel_key") + .dyn_cast() + .data(); phi_kernel_ = new phi::Kernel( - phi::KernelFactory::Instance().SelectKernel(kernel_name, kernel_key)); + phi::KernelFactory::Instance().SelectKernel(kernel_name_, kernel_key_)); PADDLE_ENFORCE_EQ( - phi_kernel_->IsValid(), true, "not found kernel for [%s]", kernel_name); + phi_kernel_->IsValid(), true, "not found kernel for [%s]", kernel_name_); VLOG(6) << "finish process select kernel"; BuildPhiContext {}_optional; - if( {}.impl() ) {}_optional = paddle::make_optional({}); + if ({}.impl()) {}_optional = paddle::make_optional({}); """ CREATE_RECOVER_OPTIONAL_VECTOR_TENSOR_TEMPLATE = """ paddle::optional> {}_optional; - if( !{}.empty() ) {}_optional = paddle::make_optional>({}); + if (!{}.empty()) {}_optional = paddle::make_optional>({}); """ SET_GRAD_OUT_DIST_ATTR_TEMPLATE = """ @@ -593,20 +593,20 @@ class {} : public egr::GradNodeBase {{ CHECK_NAN_AND_INF_TEMPLATE_FORWARD = """ if (FLAGS_check_nan_inf) {{ - egr::CheckTensorHasNanOrInf("{}", {}); + egr::CheckTensorHasNanOrInf("{}", {}); }} """ CHECK_NAN_AND_INF_TEMPLATE_BACKWARD = """ if (FLAGS_check_nan_inf) {{ - try{{ - egr::CheckTensorHasNanOrInf("{}", {}); - }} catch(...) 
{{ - LOG(WARNING) << "There are nan/inf in ({})"; - auto forward_trace = GetForwardTrace(); - std::cout<SetTensorWrapper_{name}(*{name}_clone);}""".format_map( {"indent": indent, "name": name} @@ -1102,13 +1098,13 @@ def GenerateNodeCreationCodes(self, for_backward=False, is_inplaced=False): or (name in self.optional_inputs) ): if for_backward is False: - set_tensor_wrappers = f"{indent}if({name}) grad_node->SetTensorWrapper_{name}(*{name});" + set_tensor_wrappers = f"{indent}if ({name}) grad_node->SetTensorWrapper_{name}(*{name});" else: - set_tensor_wrappers = f"{indent}if({name}_optional) grad_node->SetTensorWrapper_{name}(*{name}_optional);" + set_tensor_wrappers = f"{indent}if ({name}_optional) grad_node->SetTensorWrapper_{name}(*{name}_optional);" else: need_pre_contiguous_set.add(name) - set_tensor_wrappers = f"{indent}if({name}) grad_node->SetTensorWrapper_{name}(*{name}_tmp);" + set_tensor_wrappers = f"{indent}if ({name}) grad_node->SetTensorWrapper_{name}(*{name}_tmp);" else: if is_inplace_input: set_tensor_wrappers = f"{indent}auto {name}_clone = paddle::experimental::assign({name});\n{indent}grad_node->SetTensorWrapper_{name}({name}_clone);" @@ -1127,9 +1123,9 @@ def GenerateNodeCreationCodes(self, for_backward=False, is_inplaced=False): else: # Forwad's output as backward's input if num_fwd_outputs > 1: # Aligned with forward output position - assert ( - name in forward_outputs_position_map.keys() - ), AssertMessage(name, forward_outputs_position_map.keys()) + assert name in forward_outputs_position_map, AssertMessage( + name, forward_outputs_position_map.keys() + ) set_tensor_wrappers = ( f"{indent}grad_node->SetTensorWrapper_{name}({name});" @@ -1185,9 +1181,9 @@ def GenerateNodeCreationCodes(self, for_backward=False, is_inplaced=False): if is_optional: if for_backward is False: - set_grad_out_meta = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetGradOutMeta(*({name}.get_ptr()), {pos});" + set_grad_out_meta = f"{indent}if ({name}.get_ptr() != nullptr) grad_node->SetGradOutMeta(*({name}.get_ptr()), {pos});" else: - set_grad_out_meta = f"{indent}if({name}_optional.get_ptr() != nullptr) grad_node->SetGradOutMeta(*({name}_optional.get_ptr()), {pos});" + set_grad_out_meta = f"{indent}if ({name}_optional.get_ptr() != nullptr) grad_node->SetGradOutMeta(*({name}_optional.get_ptr()), {pos});" else: if ( is_special_forward_api @@ -1209,7 +1205,7 @@ def GenerateNodeCreationCodes(self, for_backward=False, is_inplaced=False): set_out_rank_list = [] set_history_list = [] set_grad_in_meta_list = [] - num_outputs = len(forward_outputs_position_map.keys()) + num_outputs = len(forward_outputs_position_map) for name, (_, pos) in forward_outputs_position_map.items(): output_autograd_meta_name = GetAutoGradMetaName(name) set_out_rank = f"""{indent}if ({output_autograd_meta_name}) {{ @@ -1358,7 +1354,7 @@ def GenerateForwardLayoutAutotune( intermediate_outputs = self.intermediate_outputs forward_attrs_list = self.forward_attrs_list forward_outputs_position_map = self.forward_outputs_position_map - num_outputs = len(forward_outputs_position_map.keys()) - len( + num_outputs = len(forward_outputs_position_map) - len( intermediate_outputs ) # for layout autotune attr @@ -1481,9 +1477,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): indent = GetIndent(1) # Get Function Args - num_inputs = len(forward_attrs_list) + len( - forward_inputs_position_map.keys() - ) + num_inputs = len(forward_attrs_list) + len(forward_inputs_position_map) inputs_args_definition_list = ["" for i 
in range(num_inputs)] inputs_args_declaration_list = ["" for i in range(num_inputs)] inputs_call_list = ["" for i in range(num_inputs)] @@ -1512,7 +1506,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): self.is_forward_only and is_inplaced and forward_inplace_map - and name in forward_inplace_map.keys() + and name in forward_inplace_map ): arg_str = f"paddle::optional& {name}" else: @@ -1535,7 +1529,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): if ( is_inplaced and forward_inplace_map - and name in forward_inplace_map.keys() + and name in forward_inplace_map ): arg_str = f"paddle::Tensor& {name}" amp_tensors_vector_list.append(f"{{{name}}}") @@ -1558,7 +1552,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): self.is_forward_only and is_inplaced and forward_inplace_map - and name in forward_inplace_map.keys() + and name in forward_inplace_map ): arg_str = f"paddle::optional>& {name}" else: @@ -1576,7 +1570,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): if ( is_inplaced and forward_inplace_map - and name in forward_inplace_map.keys() + and name in forward_inplace_map ): arg_str = f"std::vector& {name}" else: @@ -1623,7 +1617,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): if is_inplaced and len(forward_outputs_position_map) == 1: api_out_type = "auto&" forward_call_str = f"{indent}{api_out_type} api_result = paddle::experimental::{namespace}{function_name}({inputs_call_args_str});" - num_outputs = len(forward_outputs_position_map.keys()) - len( + num_outputs = len(forward_outputs_position_map) - len( intermediate_outputs ) @@ -1710,7 +1704,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): self.forward_api_name[-1] != '_' or self.forward_api_name == 'assign_out_' ): - for inplace_name in forward_inplace_map.keys(): + for inplace_name in forward_inplace_map: if ( not self.is_forward_only and forward_api_name not in inplace_check_blacklist @@ -1765,7 +1759,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): # 2. 
Get Output AutoGradMeta outputs_autograd_meta_list = [] - num_fwd_outputs = len(forward_outputs_position_map.keys()) + num_fwd_outputs = len(forward_outputs_position_map) for name, (rtype, pos) in forward_outputs_position_map.items(): output_autograd_meta_name = GetAutoGradMetaName(name) @@ -1882,13 +1876,13 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): for name, (ttype, pos) in forward_inputs_position_map.items(): var_str += f"\n{indent} const char* TENSOR_{name.upper()}_TEMPLATE = \" \\n( {name} , [%s]), \";" var_str += f"\n{indent} std::string input_{name}_str = paddle::string::Sprintf(TENSOR_{name.upper()}_TEMPLATE, egr::EagerUtils::TensorStr({name}));" - var_str += f"\n{indent} input_str += input_{name}_str; " + var_str += f"\n{indent} input_str += input_{name}_str;" before_log_str = BEFORE_LOG_PRINT_TEMPLATE.format(var_str) for name, (ttype, pos) in forward_outputs_position_map.items(): var_str += f"\n{indent} const char* TENSOR_{name.upper()}_TEMPLATE = \" \\n( {name} , [%s]), \";" var_str += f"\n{indent} std::string output_{name}_str = paddle::string::Sprintf(TENSOR_{name.upper()}_TEMPLATE, egr::EagerUtils::TensorStr({name}));" - var_str += f"\n{indent} output_str += output_{name}_str; " + var_str += f"\n{indent} output_str += output_{name}_str;" log_str = AFTER_LOG_PRINT_TEMPLATE.format(var_str) @@ -1958,10 +1952,7 @@ def GenerateInplacedForwardDygraphFunctions(self): forward_api_name = self.forward_api_name forward_api_contents = self.forward_api_contents - if ( - forward_api_name != "sum" - and "inplace" in forward_api_contents.keys() - ): + if forward_api_name != "sum" and "inplace" in forward_api_contents: # Function Definition and Declaration Generation self.GenerateForwardDefinitionAndDeclaration(is_inplaced=True) self.UpdateCoreOpsInformation(is_inplaced=True) @@ -1976,10 +1967,8 @@ def UpdateCoreOpsInformation(self, is_inplaced): forward_outputs_position_map = self.forward_outputs_position_map forward_attrs_list = self.forward_attrs_list - num_args = len(forward_inputs_position_map.keys()) + len( - forward_attrs_list - ) - num_returns = len(forward_outputs_position_map.keys()) + num_args = len(forward_inputs_position_map) + len(forward_attrs_list) + num_returns = len(forward_outputs_position_map) fwd_api_name = "" + forward_api_name core_ops_returns_info[fwd_api_name] = ["" for i in range(num_returns)] @@ -2042,7 +2031,7 @@ def __init__( def TransformToNextGradName(self, string): name_mapping = self.to_next_grad_name_mapping - if string in name_mapping.keys(): + if string in name_mapping: return name_mapping[string] return string @@ -2072,6 +2061,7 @@ def RecordGrad2NextGradNameMapping(self, next_node_generator): self.to_next_grad_name_mapping[grad_ret_name] = next_ret_name def GenerateHigherOrderNodeCreationCode(self): + indent = GetIndent(1) has_higher_order_node = False namespace = self.namespace grad_api_contents = self.grad_api_contents @@ -2081,6 +2071,7 @@ def GenerateHigherOrderNodeCreationCode(self): next_grad_node_creation_str = "" next_grad_node_out_list = [] next_node_generator = None + if next_grad_api_contents: # Fake forward_api_contents and backward_api_contents forward_api_contents = grad_api_contents @@ -2107,30 +2098,43 @@ def GenerateHigherOrderNodeCreationCode(self): is_composite_grad_api = ( False if self.composite_func_info == {} else True ) - if is_composite_grad_api: if next_grad_node_creation_str != '': + next_grad_node_creation_str = [ + line if len(line) else line + for line in next_grad_node_creation_str.split("\n") + ] + 
next_grad_node_creation_str = [ + (indent + line if i >= 1 and len(line) else line) + for line in next_grad_node_creation_str + ] + next_grad_node_creation_str = [ + (indent + line if len(line) else line) + for line in next_grad_node_creation_str + ] + next_grad_node_creation_str = "\n".join( + next_grad_node_creation_str + ) next_grad_node_creation_str = f""" - if (!paddle::prim::PrimCommonUtils::IsEagerPrimEnabled()) {{ - {next_grad_node_creation_str} - }} - """ + if (!paddle::prim::PrimCommonUtils::IsEagerPrimEnabled() || need_skip) {{ +{next_grad_node_creation_str} + }} +""" else: if not ( self.grad_api_contents["backward_op"] in prim_white_list or is_invoke_forward_api ): next_grad_node_creation_str = f""" - if (!paddle::prim::PrimCommonUtils::IsEagerPrimEnabled()) {{ - if(trace_backward) {{ - PADDLE_THROW(phi::errors::Unavailable( - \"The Op {self.backward_api_name} doesn't have any grad\" - \"op. If you don't intend calculating higher order\" - \"derivatives, please set `create_graph`to False.\")); + if (!paddle::prim::PrimCommonUtils::IsEagerPrimEnabled() || need_skip) {{ + if (trace_backward) {{ + PADDLE_THROW(phi::errors::Unavailable( + \"The Op {self.backward_api_name} doesn't have any grad\" + \"op. If you don't intend calculating higher order\" + \"derivatives, please set `create_graph`to False.\")); + }} }} - }} - """ - +""" if next_node_generator is not None: has_higher_order_node = True return ( @@ -2143,7 +2147,7 @@ def GenerateHigherOrderNodeCreationCode(self): ) # TODO(Ruting):Integrate invoke and composite as composite so the rest branch canbe covered elif not is_invoke_forward_api and not is_composite_grad_api: - next_grad_node_creation_str = f""" if(trace_backward) {{ + next_grad_node_creation_str = f""" if (trace_backward) {{ PADDLE_THROW(phi::errors::Unavailable( \"The Op {self.backward_api_name} doesn't have any grad\" \"op. 
If you don't intend calculating higher order\" @@ -2273,8 +2277,8 @@ def GenerateNodeDefinition( # Construct grad_api function args # Order: TensorWrappers, GradTensors, Attributes grad_api_args_len = ( - len(backward_forward_inputs_map.keys()) - + len(backward_grad_inputs_map.keys()) + len(backward_forward_inputs_map) + + len(backward_grad_inputs_map) + len(backward_attrs_list) ) grad_api_args = ["" for i in range(grad_api_args_len)] @@ -2325,7 +2329,7 @@ def GenerateNodeDefinition( is_optional = name in self.optional_inputs tensor_wrapper_recover_str = f"{indent}auto {transformed_tensor_name} = egr::EagerUtils::RecoverTensorWrapper(&this->{tensor_wrapper_name});" - if backward_inplace_map and name in backward_inplace_map.keys(): + if backward_inplace_map and name in backward_inplace_map: if has_higher_order_node: if ( transformed_tensor_name @@ -2401,7 +2405,7 @@ def GenerateNodeDefinition( get_tensor_str = f"{indent}auto& {transformed_tensor_name} = hooked_grads[{fwd_position}][0];" # Inplace in backward op - if backward_inplace_map and name in backward_inplace_map.keys(): + if backward_inplace_map and name in backward_inplace_map: if has_higher_order_node: if ( transformed_tensor_name @@ -2464,7 +2468,7 @@ def GenerateNodeDefinition( get_grad_in_args_str = "\n".join(get_grad_in_args_list) # Grad Function Call String - slot_num_bwd_outputs = len(self.forward_inputs_position_map.keys()) + slot_num_bwd_outputs = len(self.forward_inputs_position_map) grad_api_namespace = f"paddle::experimental::{namespace}" composite_grad_api_namespace = f"paddle::prim::{namespace}" grad_function_prepare_str = f""" @@ -2508,7 +2512,7 @@ def GenerateNodeDefinition( backward_inplace_map and name in backward_inplace_map.values() ): - inplace_str = f""" if (api_output_{out_index} != nullptr && can_be_inplaced) {{ + inplace_str = f"""if (api_output_{out_index} != nullptr && can_be_inplaced) {{ egr::EagerUtils::HandleViewBetweenInputAndOutput({inplace_grad_input_str}, api_output_{out_index}); }}""" if has_higher_order_node: @@ -2520,7 +2524,7 @@ def GenerateNodeDefinition( }}""" need_gen_trace_backward_for_inplace = True else: - inplace_for_grad_outs_str += inplace_str + inplace_for_grad_outs_str += " " + inplace_str grad_function_prepare_str += f""" auto* api_output_{out_index} = (out_metas[{fwd_position}].empty() || out_metas[{fwd_position}][0].IsStopGradient()) ? nullptr : &returns[{fwd_position}][0];""" @@ -2570,43 +2574,112 @@ def GenerateNodeDefinition( grad_function_call_str = f""" if (trace_backward) {{ {indent}{autograd_api_out} api_output = {autograd_api}; - {out_assign_str}}} else {{ + {out_assign_str}{indent}}} else {{ {indent}{autograd_api_out} api_output = paddle::experimental::{self.namespace}{self.grad_api_contents['invoke']}; {out_assign_str}{indent}}} - """ - # TODO(Ruting):using composite only when we don't have backward kernel in the future. +""" elif is_composite_grad_api: - if composite_grad_api_name in prim_white_list: - grad_function_call_str = f""" + has_kernel_impl = "kernel" in self.grad_api_contents + + def _gen_api_call_code_block( + in_prim_white_list: bool, + has_kernel_impl: bool, + has_higher_order_node: bool, + indention: int, + ): + """This function will generate code block for calling composite or + kernel grad api as shown below. + + // Call grad_api function + + XXX <-- Generated code by this function + XXX <-- Generated code by this function + ... <-- Generated code by this function + ... 
<-- Generated code by this function + + // Check NaN and Inf id needed + + Args: + in_prim_white_list (bool): Whether current op in `prim_white_list`. + has_kernel_impl (bool): Whether current op has kernel implementation. + has_higher_order_node (bool): Whether current op has next grad op. + indention (int): Number of single space for whole code block indention. + """ + if in_prim_white_list: + code = f""" +bool original_global_grad = egr::Controller::Instance().HasGrad(); +if (!create_graph) {{ +{indent}egr::Controller::Instance().SetHasGrad(create_graph); +}} +{composite_grad_api_namespace}{composite_grad_api_name}{composite_template_name}({composite_grad_api_args_str}); +VLOG(4) << "Composite api {composite_grad_api_name} is called"; +if (!create_graph) {{ +{indent}egr::Controller::Instance().SetHasGrad(original_global_grad); +}} +""" + if has_higher_order_node: + code = f"auto need_skip = false;{code}" + else: + code = f""" +std::string grad_op_name = "{composite_grad_api_name}"; +auto need_skip = paddle::prim::StaticCompositeContext::Instance().CheckSkipCompOps(grad_op_name); +if (paddle::prim::PrimCommonUtils::IsEagerPrimEnabled() && !need_skip) {{ {indent}bool original_global_grad = egr::Controller::Instance().HasGrad(); -{indent}if(!create_graph){{ +{indent}if (!create_graph) {{ {indent}{indent}egr::Controller::Instance().SetHasGrad(create_graph); - }} - {indent}{composite_grad_api_namespace}{composite_grad_api_name}{composite_template_name}({composite_grad_api_args_str}); - VLOG(4) << "Composite api {composite_grad_api_name} is called "; -{indent}if(!create_graph){{ +{indent}}} +{indent}{composite_grad_api_namespace}{composite_grad_api_name}{composite_template_name}({composite_grad_api_args_str}); +{indent}VLOG(4) << "Composite api {composite_grad_api_name} is called"; +{indent}if (!create_graph) {{ {indent}{indent}egr::Controller::Instance().SetHasGrad(original_global_grad); - }} - """ +{indent}}}""" + if has_kernel_impl: + code = ( + code + + f""" +}} else {{ +{indent}{grad_api_namespace}{backward_api_name}({grad_api_args_str}); +{indent}VLOG(4) << "Fused api {backward_api_name} is called"; +}} +""" + ) + else: + code = ( + code + + f""" +}} else {{ + PADDLE_THROW(phi::errors::Unavailable( + \"The grad op of {self.backward_api_name} doesn't implemented yet.\")); +}} +""" + ) + # make indention for all line(s) in code + code = "\n".join( + [ + (f"{' ' * indention}{line}" if len(line) else line) + for line in code.split("\n") + ] + ) + + return code + + if ( + self.backward_api_name not in prim_white_list + and not has_kernel_impl + ): + grad_function_call_str = _gen_api_call_code_block( + self.backward_api_name in prim_white_list, + has_kernel_impl, + has_higher_order_node, + 0, + ) else: - grad_function_call_str = f""" - std::string grad_op_name = "{composite_grad_api_name}"; - auto need_skip = paddle::prim::StaticCompositeContext::Instance().CheckSkipCompOps(grad_op_name); - if (paddle::prim::PrimCommonUtils::IsEagerPrimEnabled() && !need_skip) {{ -{indent}bool original_global_grad = egr::Controller::Instance().HasGrad(); -{indent}if(!create_graph){{ -{indent}{indent}egr::Controller::Instance().SetHasGrad(create_graph); - }} - {indent}{composite_grad_api_namespace}{composite_grad_api_name}{composite_template_name}({composite_grad_api_args_str}); - {indent}VLOG(4) << "Composite api {composite_grad_api_name} is called "; -{indent}if(!create_graph){{ -{indent}{indent}egr::Controller::Instance().SetHasGrad(original_global_grad); - }} - }}else{{ - 
{indent}{grad_api_namespace}{backward_api_name}({grad_api_args_str}); - {indent}VLOG(4) << "Fused api {backward_api_name} is called "; - }} - """ + grad_function_call_str = _gen_api_call_code_block( + self.backward_api_name in prim_white_list, + has_kernel_impl, + has_higher_order_node, + 2, + ) else: grad_function_call_str = f""" {indent}{grad_api_namespace}{backward_api_name}({grad_api_args_str});""" @@ -2630,7 +2703,7 @@ def GenerateNodeDefinition( outputs_autograd_meta_list = [] # TODO(jiabin): Optimize this with SetStopGradient instead of Pass Stop gradient - num_fwd_outputs = len(backward_grad_outputs_map.keys()) + num_fwd_outputs = len(backward_grad_outputs_map) for name, ( rtype, pos, @@ -2649,7 +2722,7 @@ def GenerateNodeDefinition( auto& {transformed_tensor_name} = returns[{pos}][0]; egr::AutogradMeta* {output_autograd_meta_name} = returns[{pos}][0].initialized() ? egr::EagerUtils::autograd_meta(&{transformed_tensor_name}) : nullptr; if ({output_autograd_meta_name}) {output_autograd_meta_name}->SetStopGradient(false); - """ +""" else: assert IsVectorTensorType(rtype) @@ -2658,7 +2731,7 @@ def GenerateNodeDefinition( auto& {transformed_tensor_name} = returns[{pos}]; std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name}); std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name}; - for(auto* meta : {output_autograd_meta_vec_name}){{ + for(auto* meta : {output_autograd_meta_vec_name}) {{ meta->SetStopGradient(false); }} """ @@ -2666,7 +2739,7 @@ def GenerateNodeDefinition( output_autograd_meta = f""" auto& {transformed_tensor_name} = returns[{pos}]; std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name}); - for(auto* meta : {output_autograd_meta_vec_name}){{ + for(auto* meta : {output_autograd_meta_vec_name}) {{ meta->SetStopGradient(false); }} """ @@ -2674,7 +2747,7 @@ def GenerateNodeDefinition( outputs_autograd_meta_str = "\n".join(outputs_autograd_meta_list) - returns_str = f"{indent}if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n" + returns_str = f"{indent}if (NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n" returns_str += f"{indent}return returns;\n" grad_node_name = GetGradNodeName(self.backward_api_name) @@ -2689,7 +2762,7 @@ def GenerateNodeDefinition( new_name = self.TransformToNextGradName(name) var_str += f"\n{indent} const char* TENSOR_{new_name.upper()}_TEMPLATE = \" \\n( {new_name} , [%s]), \";" var_str += f"\n{indent} std::string input_{new_name}_str = paddle::string::Sprintf(TENSOR_{new_name.upper()}_TEMPLATE, egr::EagerUtils::TensorStr({new_name}));" - var_str += f"\n{indent} input_str += input_{new_name}_str; " + var_str += f"\n{indent} input_str += input_{new_name}_str;" for ( name, @@ -2698,7 +2771,7 @@ def GenerateNodeDefinition( new_name = self.TransformToNextGradName(name) var_str += f"\n{indent} const char* TENSOR_{new_name.upper()}_TEMPLATE = \" \\n( {new_name} , [%s]), \";" var_str += f"\n{indent} std::string input_{new_name}_str = paddle::string::Sprintf(TENSOR_{new_name.upper()}_TEMPLATE, egr::EagerUtils::TensorStr({new_name}));" - var_str += f"\n{indent} input_str += input_{new_name}_str; " + var_str += f"\n{indent} input_str += input_{new_name}_str;" before_log_str = BEFORE_LOG_PRINT_TEMPLATE.format(var_str) @@ -2710,7 +2783,7 @@ def GenerateNodeDefinition( new_name = self.TransformToNextGradName(name) var_str += f"\n{indent} const char* TENSOR_{new_name.upper()}_TEMPLATE = \" \\n ( 
{new_name} , [%s]), \";" var_str += f"\n{indent} std::string output_{new_name}_str = paddle::string::Sprintf(TENSOR_{new_name.upper()}_TEMPLATE, egr::EagerUtils::TensorStr({new_name}));" - var_str += f"\n{indent} output_str += output_{new_name}_str; " + var_str += f"\n{indent} output_str += output_{new_name}_str;" log_str = AFTER_LOG_PRINT_TEMPLATE.format(var_str) @@ -2787,7 +2860,7 @@ def __init__( def CollectIsForwardOnly(self, forward_api_contents): self.is_forward_only = ( - False if 'backward' in forward_api_contents.keys() else True + False if 'backward' in forward_api_contents else True ) def ParseYamlContents(self): @@ -2802,11 +2875,11 @@ def ParseYamlContents(self): def GetBackwardAPIContents(self, forward_api_contents): grad_api_dict = self.grad_api_dict - if 'backward' not in forward_api_contents.keys(): + if 'backward' not in forward_api_contents: return None backward_api_name = forward_api_contents['backward'] - assert backward_api_name in grad_api_dict.keys(), AssertMessage( + assert backward_api_name in grad_api_dict, AssertMessage( backward_api_name, grad_api_dict.keys() ) backward_api_contents = grad_api_dict[backward_api_name] From e5404f0cc58dd12f547ea8176177829dc203c43e Mon Sep 17 00:00:00 2001 From: zhengzhonghui Date: Fri, 1 Mar 2024 16:00:25 +0800 Subject: [PATCH 55/55] [AutoParallel] shard_dataloader support list inputs (#62229) * [AutoParallel] shard_dataloader support list inputs * add an example * fix doc example error * add doc * fix * fix * fix doc --- .../paddle/distributed/auto_parallel/api.py | 195 +++++++++++++--- .../hybrid_strategy/CMakeLists.txt | 8 + .../semi_auto_parallel_multi_inputs.py | 212 ++++++++++++++++++ .../test_semi_auto_parallel_multi_inputs.py | 57 +++++ .../hybrid_strategy/testslist.csv | 1 + 5 files changed, 448 insertions(+), 25 deletions(-) create mode 100644 test/auto_parallel/hybrid_strategy/semi_auto_parallel_multi_inputs.py create mode 100644 test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_multi_inputs.py diff --git a/python/paddle/distributed/auto_parallel/api.py b/python/paddle/distributed/auto_parallel/api.py index 28f15011190f2..c63f8ce3a58c9 100644 --- a/python/paddle/distributed/auto_parallel/api.py +++ b/python/paddle/distributed/auto_parallel/api.py @@ -2018,22 +2018,22 @@ def __init__( process_id, self._meshes ) ) + if input_keys is not None: + assert len(input_keys) == 2, "input_keys lengths must be 2" self._all_inputs_in_one_mesh = len(self._meshes) == 1 self._input_keys = input_keys self._shard_dims = self._process_shard_dims(shard_dims) - mesh_index = self._get_mesh_idx(process_id) - if mesh_index == -1: + mesh, shard_dim = self._get_mesh_and_shard_dim(process_id) + if mesh is None: + mesh = to_list(self._meshes[0])[0] + shard_dim = to_list(self._shard_dims[0])[0] dp_rank = 0 - dp_world_size = self._meshes[0].get_dim_size(self._shard_dims[0]) + dp_world_size = mesh.get_dim_size(shard_dim) else: - dp_rank = self._meshes[mesh_index].get_rank_by_dim_and_process_id( - self._shard_dims[mesh_index], process_id - ) - dp_world_size = self._meshes[mesh_index].get_dim_size( - self._shard_dims[mesh_index] - ) + dp_rank = mesh.get_rank_by_dim_and_process_id(shard_dim, process_id) + dp_world_size = mesh.get_dim_size(shard_dim) if is_dataset_splitted is True or shard_dims is None: self._dataloader = dataloader @@ -2074,7 +2074,13 @@ def __init__( def _process_shard_dims(self, shard_dims): if isinstance(shard_dims, (int, str)) or shard_dims is None: - return [shard_dims] * len(self._meshes) + res = [] + for i in 
range(len(self._meshes)): + if isinstance(self._meshes[i], (list, tuple)): + res.append([shard_dims] * len(self._meshes[i])) + else: + res.append(shard_dims) + return res else: if len(shard_dims) != len(self._meshes): raise ValueError( @@ -2084,16 +2090,30 @@ def _process_shard_dims(self, shard_dims): ) return shard_dims - def _get_mesh_idx(self, process_id): + def _get_mesh_and_shard_dim(self, process_id): for i in range(len(self._meshes)): - if process_id in self._meshes[i]._process_ids: - return i - return -1 + if isinstance(self._meshes[i], (list, tuple)): + for j in range(len(self._meshes[i])): + if process_id in self._meshes[i][j]._process_ids: + return self._meshes[i][j], self._shard_dims[i][j] + else: + if process_id in self._meshes[i]._process_ids: + return self._meshes[i], self._shard_dims[i] + return None, None def _process_id_in_multi_meshes(self, process_id): count = 0 - for i in range(len(self._meshes)): - if process_id in self._meshes[i]._process_ids: + flatten_meshes = [] + for mesh in self._meshes: + if isinstance(mesh, (list, tuple)): + flatten_meshes.extend(mesh) + else: + flatten_meshes.append(mesh) + + # NOTE(zhengzhonghui): User may set the same mesh for different inputs, so we need to unique the meshes + unique_meshes = list(set(flatten_meshes)) + for mesh in unique_meshes: + if process_id in mesh._process_ids: count += 1 return count > 1 @@ -2123,16 +2143,69 @@ def _get_mesh_and_placement(self, index): placements.append(dist.Replicate()) return mesh, placements + def _get_meshes_and_placements_for_list_input(self, index, length): + if self._all_inputs_in_one_mesh: + meshes = [self._meshes[0]] * length + shard_dims = [self._shard_dims[0]] * length + else: + meshes = self._meshes[index] + if isinstance(meshes, (list, tuple)): + assert len(meshes) == length + else: + meshes = [meshes] * length + shard_dims = self._shard_dims[index] + if isinstance(shard_dims, (list, tuple)): + assert len(shard_dims) == length + else: + shard_dims = [shard_dims] * length + + placements = [] + for i in range(length): + if shard_dims[i] is not None: + placement = [dist.Shard(0)] + else: + placement = [dist.Replicate()] + for _ in range(1, len(meshes[i]._shape)): + placement.append(dist.Replicate()) + placements.append(placement) + return meshes, placements + + def _dtensors_from_list_input(self, list_tensors, meshes, placements): + dist_data = [] + for j in range(len(list_tensors)): + dist_data.append( + dtensor_from_local(list_tensors[j], meshes[j], placements[j]) + ) + return dist_data + def _get_batch(self, batch_data): if isinstance(batch_data, (list, tuple)): if self._all_inputs_in_one_mesh is False: assert len(batch_data) == len(self._meshes) dist_batch_data = [] for i in range(len(batch_data)): - mesh, placements = self._get_mesh_and_placement(i) - dist_batch_data.append( - dtensor_from_local(batch_data[i], mesh, placements) - ) + input_data = batch_data[i] + if isinstance(input_data, (list, tuple)): + ( + meshes, + placements, + ) = self._get_meshes_and_placements_for_list_input( + i, len(input_data) + ) + dist_batch_data.append( + self._dtensors_from_list_input( + input_data, meshes, placements + ) + ) + elif isinstance(input_data, paddle.Tensor): + mesh, placements = self._get_mesh_and_placement(i) + dist_batch_data.append( + dtensor_from_local(input_data, mesh, placements) + ) + else: + raise ValueError( + f"Unsupported input_data type {type(input_data)}" + ) return dist_batch_data elif isinstance(batch_data, dict): if self._all_inputs_in_one_mesh is False: @@ -2140,10 +2213,26 
@@ def _get_batch(self, batch_data):
             dist_batch_data = {}
             for i in range(len(self._input_keys)):
                 key = self._input_keys[i]
-                mesh, placements = self._get_mesh_and_placement(i)
-                dist_batch_data[key] = dtensor_from_local(
-                    batch_data[key], mesh, placements
-                )
+                input_data = batch_data[key]
+                if isinstance(input_data, (list, tuple)):
+                    (
+                        meshes,
+                        placements,
+                    ) = self._get_meshes_and_placements_for_list_input(
+                        i, len(input_data)
+                    )
+                    dist_batch_data[key] = self._dtensors_from_list_input(
+                        input_data, meshes, placements
+                    )
+                elif isinstance(input_data, paddle.Tensor):
+                    mesh, placements = self._get_mesh_and_placement(i)
+                    dist_batch_data[key] = dtensor_from_local(
+                        batch_data[key], mesh, placements
+                    )
+                else:
+                    raise ValueError(
+                        f"Unsupported input_data type {type(input_data)}"
+                    )
             return dist_batch_data
         else:
             raise ValueError(f"Unsupported batch_data type {type(batch_data)}")
@@ -2173,7 +2262,9 @@ def shard_dataloader(
     only if is_dataset_splitted is False and shard_dims is not None, it will do split.

    Args:
-        dataloader (paddle.io.DataLoader): The dataloader to be sharded.
+        dataloader (paddle.io.DataLoader): The dataloader to be sharded. The output of the dataloader
+            must be a list or dict of paddle.Tensor with two elements, i.e. [input_data, label] or
+            {"input_data": input_data, "label": label}; input_data and label can each be a list to support multiple inputs.
        meshes (ProcessMesh|list[ProcessMesh]|tuple[ProcessMesh]): The mesh list of the dataloader.
            Identify which mesh the input is on. if len(meshes) == 1 or type(meshes) == ProcessMesh,
            all the inputs are on the same mesh.
@@ -2191,6 +2282,7 @@ def shard_dataloader(

    Examples:
        .. code-block:: python
+            :name: example-1

            >>> import paddle
            >>> import paddle.distributed as dist
@@ -2286,6 +2378,59 @@ def shard_dataloader(
            >>> # RUN_STATIC=1 python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" {test_case}.py
            >>> # RUN_STATIC=0 python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" {test_case}.py

+        .. code-block:: python
+            :name: example-2
+
+            >>> import paddle
+            >>> import paddle.distributed as dist
+            >>> from paddle.io import BatchSampler, DataLoader, Dataset
+            >>> import numpy as np
+            >>> mesh0 = dist.ProcessMesh([[0, 1], [2, 3]], dim_names=['dp', 'mp'])
+            >>> mesh1 = dist.ProcessMesh([[4, 5], [6, 7]], dim_names=['dp', 'mp'])
+            >>> class RandomDataset(Dataset):
+            ...     def __init__(self, seq_len, hidden, num_samples=8):
+            ...         super().__init__()
+            ...         self.seq_len = seq_len
+            ...         self.hidden = hidden
+            ...         self.num_samples = num_samples
+            ...         self.inputs1 = [
+            ...             np.random.uniform(size=[self.seq_len, self.hidden]).astype(
+            ...                 "float32"
+            ...             )
+            ...             for _ in range(num_samples)
+            ...         ]
+            ...         self.inputs2 = [
+            ...             np.random.uniform(size=[self.seq_len, self.hidden]).astype(
+            ...                 "float32"
+            ...             )
+            ...             for _ in range(num_samples)
+            ...         ]
+            ...         self.labels = [
+            ...             np.array(index, dtype="float32") for index in range(num_samples)
+            ...         ]
+            ...     def __getitem__(self, index):
+            ...         return {
+            ...             "inputs": [self.inputs1[index], self.inputs2[index]],
+            ...             "label": self.labels[index],
+            ...         }
+            ...     def __len__(self):
+            ...         return self.num_samples

+            >>> dataset = RandomDataset(4, 8)
+            >>> sampler = BatchSampler(
+            ...     dataset,
+            ...     batch_size=2,
+            ... )
+            >>> dataloader = DataLoader(
+            ...     dataset,
+            ...     batch_sampler=sampler,
+            ... )
+            >>> dist_dataloader = dist.shard_dataloader(
+            ...     dataloader=dataloader,
+            ...     meshes=[mesh0, mesh1],  # or [[mesh0, mesh0], mesh1]
+            ...     shard_dims="dp",
+            ...     
input_keys=["inputs", "label"], + ... ) """ return ShardDataloader( diff --git a/test/auto_parallel/hybrid_strategy/CMakeLists.txt b/test/auto_parallel/hybrid_strategy/CMakeLists.txt index 08a9f42c02a1f..063b1b5873e74 100644 --- a/test/auto_parallel/hybrid_strategy/CMakeLists.txt +++ b/test/auto_parallel/hybrid_strategy/CMakeLists.txt @@ -73,3 +73,11 @@ if((WITH_GPU) AND (LINUX)) set_tests_properties(test_semi_auto_parallel_global_input PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=HYBRID") endif() +if((WITH_GPU) AND (LINUX)) + py_test_modules( + test_semi_auto_parallel_multi_inputs MODULES + test_semi_auto_parallel_multi_inputs ENVS + "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_semi_auto_parallel_multi_inputs + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=HYBRID") +endif() diff --git a/test/auto_parallel/hybrid_strategy/semi_auto_parallel_multi_inputs.py b/test/auto_parallel/hybrid_strategy/semi_auto_parallel_multi_inputs.py new file mode 100644 index 0000000000000..a7166ca901d09 --- /dev/null +++ b/test/auto_parallel/hybrid_strategy/semi_auto_parallel_multi_inputs.py @@ -0,0 +1,212 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import numpy as np + +import paddle +import paddle.distributed as dist +from paddle.io import BatchSampler, DataLoader, Dataset + +SEQ_LEN = 4 +HIDDLE_SIZE = 8 +global_mesh = dist.ProcessMesh( + [[[0, 1], [2, 3]], [[4, 5], [6, 7]]], dim_names=['pp', 'dp', 'mp'] +) +mesh0 = dist.ProcessMesh([[0, 1], [2, 3]], dim_names=['dp', 'mp']) +mesh1 = dist.ProcessMesh([[4, 5], [6, 7]], dim_names=['dp', 'mp']) + + +class MlpModel(paddle.nn.Layer): + def __init__(self, variable_initial_values, run_single_process=False): + super().__init__() + self.w0 = self.create_parameter( + shape=[HIDDLE_SIZE, HIDDLE_SIZE], + default_initializer=paddle.nn.initializer.Assign( + variable_initial_values[0] + ), + ) + self.w1 = self.create_parameter( + shape=[HIDDLE_SIZE, HIDDLE_SIZE], + default_initializer=paddle.nn.initializer.Assign( + variable_initial_values[1] + ), + ) + if run_single_process is False: + self.w0 = dist.shard_tensor( + self.w0, + mesh0, + [dist.Replicate(), dist.Shard(1)], + ) + self.w1 = dist.shard_tensor( + self.w1, + mesh1, + [dist.Replicate(), dist.Shard(0)], + ) + self.run_single_process = run_single_process + + def forward(self, input1, input2): + x = input1 + input2 + # x: [bs, seq_len, hidden] + # forward on mesh0 + y = paddle.matmul(x, self.w0) + # forward on mesh1 + if self.run_single_process is False: + y = dist.reshard(y, mesh1, [dist.Shard(0), dist.Shard(2)]) + z = paddle.matmul(y, self.w1) + return z + + +class RandomDataset(Dataset): + def __init__(self, seq_len, hidden, num_samples=8): + super().__init__() + self.seq_len = seq_len + self.hidden = hidden + self.num_samples = num_samples + self.inputs1 = [ + np.random.uniform(size=[self.seq_len, self.hidden]).astype( + "float32" + ) + for _ in range(num_samples) + ] + self.inputs2 = 
[ + np.random.uniform(size=[self.seq_len, self.hidden]).astype( + "float32" + ) + for _ in range(num_samples) + ] + self.labels = [ + np.array(index, dtype="float32") for index in range(num_samples) + ] + + def __getitem__(self, index): + return { + "inputs": [self.inputs1[index], self.inputs2[index]], + "label": self.labels[index], + } + + def __len__(self): + return self.num_samples + + +def create_dataloader(): + dataset = RandomDataset(SEQ_LEN, HIDDLE_SIZE) + sampler = BatchSampler( + dataset, + batch_size=2, + ) + dataloader = DataLoader( + dataset, + batch_sampler=sampler, + ) + return dataloader + + +def get_variable_initial_value(var_num=2): + res = [] + for i in range(var_num): + res.append( + paddle.uniform( + shape=[HIDDLE_SIZE, HIDDLE_SIZE], + dtype=paddle.float32, + min=-0.0001, + max=0.0001, + ) + ) + return res + + +def loss_fn(logits, label): + # logits: [bs, seq_len, hidden], label: [bs] + loss = paddle.nn.MSELoss(reduction="sum") + logits = paddle.sum(logits, axis=[1, 2]) + return loss(logits, label) + + +class TestSemiAutoParallelMultiInputs: + def __init__(self): + self._backend = os.getenv("backend") + self._seed = eval(os.getenv("seed")) + self._run_static = eval(os.getenv("run_static")) + paddle.seed(self._seed) + np.random.seed(self._seed) + paddle.set_device(self._backend) + self.dataloader = create_dataloader() + self.variable_initial_values = get_variable_initial_value() + self.single_process_loss = self.get_single_process_loss() + + def get_single_process_loss(self): + model = MlpModel( + variable_initial_values=self.variable_initial_values, + run_single_process=True, + ) + opt = paddle.optimizer.AdamW( + learning_rate=0.001, parameters=model.parameters() + ) + for step, data in enumerate(self.dataloader()): + input1, input2 = data["inputs"] + logits = model(input1, input2) + label = data["label"] + loss = loss_fn(logits, label) + loss.backward() + opt.step() + opt.clear_grad() + return loss.numpy() + + def test_basic(self): + model = MlpModel(variable_initial_values=self.variable_initial_values) + opt = paddle.optimizer.AdamW( + learning_rate=0.001, parameters=model.parameters() + ) + dist_dataloader = dist.shard_dataloader( + dataloader=self.dataloader, + meshes=[mesh0, mesh1], # or [[mesh0, mesh0], mesh1] + shard_dims="dp", + input_keys=["inputs", "label"], + ) + cur_rank = paddle.distributed.get_rank() + if self._run_static: + dist_model = dist.to_static(model, dist_dataloader, loss_fn, opt) + + for step, data in enumerate(dist_dataloader()): + input1, input2 = data["inputs"] + label = data["label"] + loss = dist_model(input1, input2, label) + + if cur_rank in [5, 7]: + loss = paddle.to_tensor(loss) + group = paddle.distributed.new_group([5, 7]) + dist.all_reduce(loss, group=group) + else: + dist_opt = dist.shard_optimizer(opt) + for step, data in enumerate(dist_dataloader()): + input1, input2 = data["inputs"] + logits = model(input1, input2) + label = data["label"] + loss = loss_fn(logits, label) + loss.backward() + dist_opt.step() + dist_opt.clear_grad() + if cur_rank in [5, 7]: + np.testing.assert_allclose( + loss.numpy(), self.single_process_loss, rtol=1e-06, verbose=True + ) + + def run_test_case(self): + self.test_basic() + + +if __name__ == '__main__': + TestSemiAutoParallelMultiInputs().run_test_case() diff --git a/test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_multi_inputs.py b/test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_multi_inputs.py new file mode 100644 index 0000000000000..e172ba1da70f5 --- /dev/null +++ 
b/test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_multi_inputs.py @@ -0,0 +1,57 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import collective.test_communication_api_base as test_base + + +class TestSemiAutoParallelMultiInputs(test_base.CommunicationTestDistBase): + def setUp(self): + super().setUp( + num_of_devices=8, + timeout=120, + nnode=1, + ) + self._default_envs = { + "dtype": "float32", + "seed": "1024", + } + self._changeable_envs = {"backend": ["gpu"]} + + def test_dynamic(self): + self._default_envs.update({"run_static": "0"}) + envs_list = test_base.gen_product_envs_list( + self._default_envs, self._changeable_envs + ) + for envs in envs_list: + self.run_test_case( + "semi_auto_parallel_multi_inputs.py", + user_defined_envs=envs, + ) + + def test_static(self): + self._default_envs.update({"run_static": "1"}) + envs_list = test_base.gen_product_envs_list( + self._default_envs, self._changeable_envs + ) + for envs in envs_list: + self.run_test_case( + "semi_auto_parallel_multi_inputs.py", + user_defined_envs=envs, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/auto_parallel/hybrid_strategy/testslist.csv b/test/auto_parallel/hybrid_strategy/testslist.csv index 5791b71d0d5ff..2fac60515b51a 100644 --- a/test/auto_parallel/hybrid_strategy/testslist.csv +++ b/test/auto_parallel/hybrid_strategy/testslist.csv @@ -8,3 +8,4 @@ test_semi_auto_parallel_llama_model_amp,LINUX,GPU,180,HYBRID,test_runner.py,,,ht test_semi_auto_parallel_hybrid_sharding_strategy,LINUX,GPU,120,HYBRID,test_runner.py,,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_global_mesh_reshard,LINUX,GPU,120,HYBRID,test_runner.py,,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_semi_auto_parallel_global_input,LINUX,GPU,120,HYBRID,test_runner.py,,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_semi_auto_parallel_multi_inputs,LINUX,GPU,120,HYBRID,test_runner.py,,,http_proxy=;https_proxy=;PYTHONPATH=../..,
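
Editor's note: the list-input support in PATCH 55 rests on two small broadcasting rules: a scalar shard_dims fans out over (possibly nested) meshes, and each list element gets Shard(0) on the batch axis when a shard dim is given, otherwise Replicate everywhere. The sketch below illustrates only those rules; the Mesh class and both helper functions are hypothetical stand-ins written for this note, not Paddle APIs (the real logic lives in ShardDataloader._process_shard_dims and _get_meshes_and_placements_for_list_input).

from dataclasses import dataclass


@dataclass(frozen=True)
class Mesh:
    """Hypothetical stand-in for dist.ProcessMesh; only the shape rank matters here."""

    shape: tuple


def process_shard_dims(meshes, shard_dims):
    # Mirrors the fan-out rule: a scalar (or None) shard_dims is broadcast to
    # every mesh, including each member of a nested mesh list.
    if isinstance(shard_dims, (int, str)) or shard_dims is None:
        return [
            [shard_dims] * len(m) if isinstance(m, (list, tuple)) else shard_dims
            for m in meshes
        ]
    if len(shard_dims) != len(meshes):
        raise ValueError("shard_dims must have the same length as meshes")
    return shard_dims


def placements_for(mesh, shard_dim):
    # Placement rule for list inputs: shard the batch axis (dim 0) when a
    # shard dim is given, otherwise replicate across every mesh dimension.
    head = "Shard(0)" if shard_dim is not None else "Replicate()"
    return [head] + ["Replicate()"] * (len(mesh.shape) - 1)


mesh0, mesh1 = Mesh((2, 2)), Mesh((2, 2))
print(process_shard_dims([[mesh0, mesh0], mesh1], "dp"))
# [['dp', 'dp'], 'dp'] -- nested meshes receive a nested shard_dims list
print(placements_for(mesh0, "dp"))  # ['Shard(0)', 'Replicate()']
print(placements_for(mesh1, None))  # ['Replicate()', 'Replicate()']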