
Fuse Split-Reduce with MLIR #3319

Merged: 195 commits, Aug 21, 2024

Commits
7e24411
Add atomic ops
pfultz2 Apr 15, 2024
244d8b8
Add missing header
pfultz2 Apr 15, 2024
c53c40a
Add support for half type
pfultz2 Apr 15, 2024
d39f832
Add fuse methods to module
pfultz2 Apr 18, 2024
d2d3bae
Format
pfultz2 Apr 18, 2024
af83509
Add some initial code
pfultz2 Apr 18, 2024
ac47954
Format
pfultz2 Apr 18, 2024
c9407aa
Reuse find_inputs
pfultz2 Apr 18, 2024
ea41fb9
Format
pfultz2 Apr 18, 2024
15c06b5
Merge branch 'develop' into split-reduce2
pfultz2 Apr 20, 2024
3931cfc
Handle two reductions
pfultz2 Apr 20, 2024
0370543
Format
pfultz2 Apr 20, 2024
d4db0f6
Handle multi outputs in split reduce
pfultz2 Apr 20, 2024
5b37853
Format
pfultz2 Apr 20, 2024
ac747b2
Split two reductions
pfultz2 Apr 20, 2024
2f7e96c
Format
pfultz2 Apr 20, 2024
acb291b
Merge
pfultz2 Apr 24, 2024
c6a7caa
Add split fix
pfultz2 Apr 25, 2024
25442a5
Fix bug with live instruction after split
pfultz2 Apr 25, 2024
3b04922
Format
pfultz2 Apr 25, 2024
1cfa65e
Remove debug prints
pfultz2 Apr 25, 2024
61d788c
Enable with env var
pfultz2 Apr 25, 2024
cad9d3d
Merge branch 'develop' into mlir-fuse-inputs
pfultz2 Apr 26, 2024
52a6a0e
Merge branch 'develop' into mlir-fuse-inputs
pfultz2 Apr 29, 2024
6533429
Fix merge conflict
pfultz2 May 7, 2024
36af65c
Merge branch 'develop' into split-reduce2
pfultz2 May 13, 2024
78161de
Use reaches
pfultz2 May 13, 2024
166f7c9
Merge branch 'develop' into split-reduce2
pfultz2 May 14, 2024
6647d4b
Remove dominator
pfultz2 May 14, 2024
66e9d31
Update comments
pfultz2 May 14, 2024
9107b26
Format
pfultz2 May 14, 2024
cbf3afc
Fix param_utils
pfultz2 May 14, 2024
3947949
Filter supported ops
pfultz2 May 14, 2024
ffdba3c
Add another comment
pfultz2 May 14, 2024
019bb0d
Add test for multi out split reduce
pfultz2 May 15, 2024
c33f7fd
Format
pfultz2 May 15, 2024
86df8f1
Add dominator back
pfultz2 May 16, 2024
7e5babf
Format
pfultz2 May 16, 2024
6f99033
Merge
pfultz2 May 22, 2024
48712c9
Handle scalars
pfultz2 May 23, 2024
c3cf902
Format
pfultz2 May 23, 2024
28b013e
Merge
pfultz2 Jun 15, 2024
d40bbae
Merge branch 'develop' into split-reduce2
pfultz2 Jun 15, 2024
f9df6fa
Merge branch 'develop' into mlir-fuse-inputs
pfultz2 Jun 18, 2024
51d3ea9
Add description
pfultz2 Jun 18, 2024
0e60085
Update src/targets/gpu/fuse_mlir.cpp
causten Jun 19, 2024
fd0b7f7
Add doc
pfultz2 Jun 20, 2024
43ff58b
Merge branch 'develop' into split-reduce2
pfultz2 Jun 20, 2024
9c4d659
Add input fusion to jenkins
pfultz2 Jun 20, 2024
5eaaed3
Add unit test for fuse module
pfultz2 Jun 21, 2024
1edac2d
Format
pfultz2 Jun 21, 2024
4a825cb
Merge branch 'develop' into mlir-fuse-inputs
causten Jun 21, 2024
1ebdaf1
Add unit test
pfultz2 Jun 21, 2024
d035f3b
Format
pfultz2 Jun 21, 2024
8c4b8f0
Rename type
pfultz2 Jun 21, 2024
271ea78
Add verify test
pfultz2 Jun 21, 2024
43e76f5
Format
pfultz2 Jun 21, 2024
b357f94
Fix tidy issue
pfultz2 Jun 21, 2024
fbb630e
Fix tidy
pfultz2 Jun 22, 2024
a3ff01a
Format
pfultz2 Jun 22, 2024
7686c3d
Merge branch 'develop' into split-reduce2
pfultz2 Jun 22, 2024
5e848ba
Fix parameter name
pfultz2 Jun 22, 2024
c7ff9a7
Merge branch 'split-reduce2' of github.com:ROCmSoftwarePlatform/AMDMI…
pfultz2 Jun 22, 2024
3964597
Add line
pfultz2 Jun 22, 2024
efb1f76
Format
pfultz2 Jun 22, 2024
f3b2b95
Merge branch 'develop' into split-reduce2
causten Jun 26, 2024
ee50c26
Merge branch 'develop' into mlir-fuse-inputs
causten Jul 2, 2024
c0c51c5
Merge branch 'develop' into split-reduce2
causten Jul 2, 2024
5a7f247
Merge branch 'develop' into mlir-fuse-inputs
umangyadav Jul 10, 2024
ae29e39
add reshapes to fused mlir
umangyadav Jul 10, 2024
470984d
use fuse instead of fold_pointwise
umangyadav Jul 11, 2024
593b119
Passes make check
umangyadav Jul 11, 2024
d49cfe3
pull in changes for find_dot_slice
umangyadav Jul 12, 2024
c12c6bc
add unittest
umangyadav Jul 15, 2024
55c3c6d
add verify test
umangyadav Jul 15, 2024
1f76cc5
debugging
umangyadav Jul 16, 2024
a238d2a
add lowering for contiguous
umangyadav Jul 16, 2024
e26120b
use input_rep_map
umangyadav Jul 16, 2024
c8b06d5
add eliminate_contiguous
umangyadav Jul 16, 2024
64642c9
Add lowering for reshape
umangyadav Jul 16, 2024
0149594
Merge branch 'develop' into mlir-reshape
umangyadav Jul 16, 2024
886fc1b
Fix cppcheck
umangyadav Jul 16, 2024
04e37ad
fix tidy
umangyadav Jul 16, 2024
e533627
Merge branch 'develop' into mlir-reshape
umangyadav Jul 16, 2024
2409622
fixes
umangyadav Jul 16, 2024
8a008b6
rename test file
umangyadav Jul 16, 2024
2a75820
formatting
umangyadav Jul 16, 2024
96ac474
fix SLES
umangyadav Jul 16, 2024
7a65f2e
Merge branch 'develop' into mlir-reshape
umangyadav Jul 16, 2024
a46bbaa
fix test
umangyadav Jul 18, 2024
f4b3211
Merge branch 'develop' into mlir-reshape
umangyadav Jul 18, 2024
b88f6bd
use anonymous namespace
umangyadav Jul 19, 2024
518fce3
Merge branch 'develop' into mlir-reshape
umangyadav Jul 23, 2024
c9f5201
multi use case
umangyadav Jul 22, 2024
8a44a13
fix replace
umangyadav Jul 22, 2024
c984b83
clean up
umangyadav Jul 23, 2024
ea3fdb7
add test
umangyadav Jul 23, 2024
329955b
add multi use case
umangyadav Jul 23, 2024
d14cd66
revert test change
umangyadav Jul 23, 2024
1e981a2
add verify test
umangyadav Jul 23, 2024
2a1c4cd
fix return
umangyadav Jul 23, 2024
9a9c2c4
Formatting
umangyadav Jul 23, 2024
9c50be6
Merge branch 'add_multi_use' into mlir-split-reduce
umangyadav Jul 23, 2024
bb76528
Add missing ellipsis
pfultz2 Jul 23, 2024
51d3c5f
Add licenses
pfultz2 Jul 23, 2024
cb909a4
Format
pfultz2 Jul 23, 2024
3f4ef63
split-reduce fusion working
umangyadav Jul 24, 2024
374e74b
Merge branch 'develop' into split-reduce2
pfultz2 Jul 24, 2024
0f785f0
Update test/split_reduce.cpp
pfultz2 Jul 24, 2024
97e4861
Update test/split_reduce.cpp
pfultz2 Jul 24, 2024
32140c9
Fix test
pfultz2 Jul 24, 2024
daa607c
Format
pfultz2 Jul 24, 2024
bca36d8
Merge branch 'split-reduce2' of github.com:ROCmSoftwarePlatform/AMDMI…
pfultz2 Jul 24, 2024
94d9456
refactor pieces
umangyadav Jul 25, 2024
072f8dc
formatting
umangyadav Jul 25, 2024
0a2a8d8
renamed
umangyadav Jul 25, 2024
0a06260
refactor
umangyadav Jul 25, 2024
dff3dd4
remove debug
umangyadav Jul 25, 2024
ff94e04
add logic for checking is mlir_split_reduce
umangyadav Jul 25, 2024
9540c78
add logic for is_reduce in header files
umangyadav Jul 25, 2024
00fef22
Format
pfultz2 Jul 25, 2024
207f94e
add TODO
umangyadav Jul 25, 2024
f022edb
add assert
umangyadav Jul 25, 2024
1397e09
Merge branch 'develop' into mlir-reshape
umangyadav Jul 25, 2024
244e62e
remove else
umangyadav Jul 26, 2024
e4c9eb9
remove else
umangyadav Jul 26, 2024
ba53ce4
Merge branch 'develop' into split-reduce2
causten Jul 26, 2024
64ed1ec
Merge branch 'develop' into mlir-reshape
umangyadav Jul 29, 2024
cd8762f
Merge branch 'mlir-reshape' into add_multi_use
umangyadav Jul 29, 2024
7e356a6
Merge branch 'develop' into mlir-reshape
umangyadav Jul 29, 2024
6b81657
Merge branch 'mlir-reshape' into add_multi_use
umangyadav Jul 29, 2024
a1c5ad7
use mlir for the reshapes
umangyadav Jul 30, 2024
ca11ca4
fuse reshapes with dot
umangyadav Jul 30, 2024
9a7aa0b
remove header
umangyadav Jul 30, 2024
d3ab2af
remove changes for module split
umangyadav Jul 30, 2024
2e7c2d8
Merge branch 'mlir-reshape' into add_multi_use
umangyadav Jul 30, 2024
83fd160
flatten outputs
umangyadav Jul 30, 2024
a784df3
Merge branch 'add_multi_use' into mlir-split-reduce
umangyadav Jul 30, 2024
5d9fe2a
Merge branch 'split-reduce2' into mlir-split-reduce
umangyadav Jul 30, 2024
662a29d
disable test
umangyadav Jul 30, 2024
d4dd7af
remove TODO
umangyadav Jul 30, 2024
c1cba50
Update TODO
pfultz2 Jul 30, 2024
805793d
add verify test
umangyadav Jul 30, 2024
74496ba
Merge branch 'develop' into split-reduce2
umangyadav Jul 30, 2024
fd5a9a1
increase reduce limit, disable rewrite_reduce to reduce_sum
umangyadav Jul 30, 2024
bd1eca3
Get correct data type for lane reductions
pfultz2 Jul 30, 2024
06f54fa
Merge remote-tracking branch 'origin/lane-parallel-reduce' into mlir-…
umangyadav Jul 30, 2024
c5032ff
Merge remote-tracking branch 'origin/split-reduce2' into mlir-split-r…
umangyadav Jul 30, 2024
631127a
enable test again
umangyadav Jul 30, 2024
e82daf1
revert back split size
umangyadav Jul 31, 2024
eb4f262
add MIGRAPHX_EXPORT For the reaches
umangyadav Jul 31, 2024
4589e09
Merge branch 'split-reduce2' into mlir-split-reduce
umangyadav Jul 31, 2024
1ac328b
add test for the MLIR slow bench
umangyadav Jul 31, 2024
ddbf8ba
Merge branch 'develop' into add_multi_use
umangyadav Jul 31, 2024
68a8afb
fix merge
umangyadav Jul 31, 2024
7e83db3
fix unit-test
umangyadav Jul 31, 2024
ec3dc3f
Merge branch 'add_multi_use' into mlir-split-reduce
umangyadav Jul 31, 2024
c5b70b7
merge fixes
umangyadav Jul 31, 2024
ca7df92
fix return bug enable rewrite_reduce
umangyadav Jul 31, 2024
9f56e6a
fix wiring
umangyadav Jul 31, 2024
f1550b1
fix output shape
umangyadav Jul 31, 2024
f276db5
remove debug prints
umangyadav Jul 31, 2024
2076920
add env flag for the reduce fusion
umangyadav Aug 1, 2024
43a22e5
add doc
umangyadav Aug 1, 2024
a4d546d
formatting
umangyadav Aug 1, 2024
c64d2ee
fix cppcheck
umangyadav Aug 1, 2024
40325f9
update problem_key && jenkins
umangyadav Aug 1, 2024
67ea3c6
change EPS
umangyadav Aug 1, 2024
8ebbb0e
Merge remote-tracking branch 'origin/develop' into mlir-split-reduce
umangyadav Aug 1, 2024
ece936f
Merge branch 'develop' into mlir-split-reduce
umangyadav Aug 12, 2024
c5c4c72
Merge branch 'develop' into mlir-split-reduce
umangyadav Aug 12, 2024
5b51efd
merge fixes
umangyadav Aug 12, 2024
5e828ee
fix tidy
umangyadav Aug 12, 2024
6e78168
Merge branch 'develop' into mlir-split-reduce
umangyadav Aug 12, 2024
69fef78
change EPS For half and fp8
umangyadav Aug 13, 2024
70063f9
Merge branch 'develop' into mlir-split-reduce
umangyadav Aug 13, 2024
34c539f
Merge branch 'develop' into mlir-split-reduce
umangyadav Aug 14, 2024
1ebf2a3
address review comments
umangyadav Aug 14, 2024
102a246
formatting
umangyadav Aug 14, 2024
f967f7d
Merge remote-tracking branch 'origin/develop' into mlir-split-reduce
umangyadav Aug 16, 2024
335be33
address review comments, add dump_mlir test
umangyadav Aug 16, 2024
112b14a
formatting
umangyadav Aug 16, 2024
57c550e
fix typo
umangyadav Aug 16, 2024
b02eb78
fix tidy
umangyadav Aug 16, 2024
86b98aa
add test
umangyadav Aug 16, 2024
df96690
add reduce.hpp header
umangyadav Aug 16, 2024
8e0acd0
add multi use unit-test
umangyadav Aug 16, 2024
a5733c5
fix licensing
umangyadav Aug 16, 2024
93d24bf
Merge branch 'develop' into mlir-split-reduce
umangyadav Aug 16, 2024
070da3d
revert problem_key changes
umangyadav Aug 16, 2024
dc71b68
add one more test
umangyadav Aug 16, 2024
1b68e45
use auto_add_return
umangyadav Aug 16, 2024
4e043c7
use `insert_inline()`
umangyadav Aug 17, 2024
848d807
fix cppcheck
umangyadav Aug 17, 2024
94e112a
Formatting
umangyadav Aug 18, 2024
2 changes: 1 addition & 1 deletion Jenkinsfile
@@ -144,7 +144,7 @@ rocmtest clang_debug: rocmnode('mi100+') { cmake_build ->
}
}, mlir_debug: rocmnode('mi100+') { cmake_build ->
stage('MLIR Debug') {
withEnv(['MIGRAPHX_ENABLE_EXTRA_MLIR=1', 'MIGRAPHX_MLIR_USE_SPECIFIC_OPS=fused,attention,convolution,dot', 'MIGRAPHX_ENABLE_MLIR_INPUT_FUSION=1', 'MIGRAPHX_MLIR_ENABLE_SPLITK=1']) {
withEnv(['MIGRAPHX_ENABLE_EXTRA_MLIR=1', 'MIGRAPHX_MLIR_USE_SPECIFIC_OPS=fused,attention,convolution,dot', 'MIGRAPHX_ENABLE_MLIR_INPUT_FUSION=1', 'MIGRAPHX_MLIR_ENABLE_SPLITK=1', 'MIGRAPHX_ENABLE_MLIR_REDUCE_FUSION=1', 'MIGRAPHX_ENABLE_SPLIT_REDUCE=1','MIGRAPHX_DISABLE_LAYERNORM_FUSION=1']) {
def sanitizers = "undefined"
// Note: the -fno-sanitize= is copied from upstream LLVM_UBSAN_FLAGS.
def debug_flags_cxx = "-g -O2 -fsanitize=${sanitizers} -fno-sanitize=vptr,function -fno-sanitize-recover=${sanitizers}"
5 changes: 5 additions & 0 deletions docs/dev/env_vars.rst
@@ -278,6 +278,11 @@ Limits the number of solutions available to MLIR for tuning.
Set to "1", "enable", "enabled", "yes", or "true" to use.
Enable input fusions in MLIR.

.. envvar:: MIGRAPHX_ENABLE_MLIR_REDUCE_FUSION

Set to "1", "enable", "enabled", "yes", or "true" to use.
Enable reduction fusions in MLIR.

.. envvar:: MIGRAPHX_MLIR_ENABLE_SPLITK

Set to "1", "enable", "enabled", "yes", or "true" to use.
9 changes: 9 additions & 0 deletions src/include/migraphx/module.hpp
@@ -259,6 +259,15 @@ struct MIGRAPHX_EXPORT module
const std::vector<instruction_ref>& inputs,
std::unordered_map<instruction_ref, instruction_ref>* map_ins = nullptr,
inserter insert = nullptr);
/*
Insert instructions from module `m` into this module at position `ins`.
*/
std::vector<instruction_ref>
insert_inline(instruction_ref ins,
const module& m,
const std::vector<instruction_ref>& inputs,
std::unordered_map<instruction_ref, instruction_ref>* map_ins = nullptr,
inserter insert = nullptr);

void debug_print() const;
void debug_print(instruction_ref ins) const;
2 changes: 1 addition & 1 deletion src/include/migraphx/op/pad.hpp
@@ -1,7 +1,7 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
19 changes: 17 additions & 2 deletions src/module.cpp
@@ -1041,14 +1041,29 @@
if(map_ins == nullptr)
map_ins = &default_map_ins;
insert_params(*this, inputs, *map_ins);
auto param_map = m.get_ins_param_map(inputs);
for(auto&& [input, param] : param_map)
auto param_map = m.get_ins_param_map(inputs, true);
for(auto&& [param, input] : param_map)
Collaborator comment: Why is this flipped?

Author reply: There are cases where the same input instruction is mapped to multiple parameters, e.g. `split_fused_reduce(x, y, x)`. In those cases, a mapping from input to param would de-duplicate it and add only a single parameter. Later, `copy_ins = m.add_parameter(name, s);` would not find the parameter in `map_ins`, would try to add it, and fail.

{
(*map_ins)[param] = map_ins->at(input);
}
return this->add_instructions(&m, map_ins, std::move(insert));
}

std::vector<instruction_ref>
module::insert_inline(instruction_ref ins,
// Codecov: added line 1053 (src/module.cpp) not covered by tests
const module& m,
const std::vector<instruction_ref>& inputs,
std::unordered_map<instruction_ref, instruction_ref>* map_ins,
module::inserter insert)
{
std::unordered_map<instruction_ref, instruction_ref> default_map_ins;
if(map_ins == nullptr)
// Codecov: added line 1060 not covered by tests
map_ins = &default_map_ins;
auto param_map = m.get_ins_param_map(inputs, true);
map_ins->insert(param_map.begin(), param_map.end());
return this->insert_instructions(ins, &m, map_ins, std::move(insert));
// Codecov: added lines 1062-1064 not covered by tests
}

void module_with_inputs::replace(instruction_ref ins, instruction_ref rep)
{
auto it = std::find(inputs.begin(), inputs.end(), ins);
145 changes: 143 additions & 2 deletions src/targets/gpu/fuse_mlir.cpp
@@ -43,6 +43,7 @@ namespace gpu {

MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_EXTRA_MLIR);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_MLIR_INPUT_FUSION);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_MLIR_REDUCE_FUSION);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_MLIR);
/**
* @brief Declares a new MIGraphX environment variable which forces to generate
@@ -386,13 +387,59 @@ bool is_pointwise_op_supported_by_mlir(const instruction& i)
return false;
}

bool is_reduce_op_supported_by_mlir(const instruction& i)
{
using type_t = shape::type_t;
const auto& name = i.name();
const auto result_type = i.get_shape().type();
const std::initializer_list<type_t> allowed_types = {
type_t::float_type, type_t::half_type, type_t::fp8e4m3fnuz_type};
// Preliminary type check.
if(not contains(allowed_types, result_type))
{
return false;
}
const std::initializer_list<std::string> reduce_ops = {"reduce_mean", "reduce_sum"};
return contains(reduce_ops, i.name());
}

// A separate function so we can remove operators that are supported by mlir
// but not supported for an input fusion.
bool is_pointwise_op_supported_by_mlir_for_input(const instruction& i)
{
return is_pointwise_op_supported_by_mlir(i);
}

MIGRAPHX_PRED_MATCHER(mlir_split_reduce, instruction_ref ins)
{
if(ins->name() != "split_fused_reduce")
return false;
auto* mod_arg = ins->module_inputs().front();
auto supported_reshapes = reshaper_names();
supported_reshapes.erase("slice");
std::unordered_set<std::string> builtins = {"@param", "@literal", "@return"};
for(const auto i : iterator_for(*mod_arg))
{
if(is_reduce(*i))
{
if(not is_reduce_op_supported_by_mlir(*i))
return false;
}
else if(i->name() == "pointwise")
{
if(not std::all_of(i->module_inputs().front()->begin(),
i->module_inputs().front()->end(),
&is_pointwise_op_supported_by_mlir))
return false;
}
else if(not contains(reshaper_names(), i->name()) and not contains(builtins, i->name()))
{
return false;
}
}
return true;
}

MIGRAPHX_PRED_MATCHER(mlir_pointwise, instruction_ref ins)
{
if(ins->name() != "pointwise")
@@ -423,6 +470,90 @@ std::vector<instruction_ref> mlir_contiguous(module_pass_manager& mpm,
return result;
}

struct find_mlir_split_reduce
{
mlir_mode conv_mode = mlir_mode::none;
mlir_mode dot_mode = mlir_mode::none;
auto matcher() const
{
auto dot_or_conv = match::name("gpu::mlir_op");
// TODO: Handle reshapes in between
return mlir_split_reduce()(match::any_of[match::inputs()](dot_or_conv.bind("gemm")));
}

void apply(module_pass_manager& mpm, const match::matcher_result& r) const
{
auto reduce_ins = r.result;
auto gemm_ins = r.instructions["gemm"];
assert(gemm_ins->get_shape().sub_shapes().empty());
auto* rm = reduce_ins->module_inputs().front();
auto names = rm->get_parameter_names();
std::sort(names.begin(), names.end());
module_ref gemm_old_mm = gemm_ins->module_inputs().front();
module_ref mm = mpm.create_module(gemm_old_mm->name() + "_" + rm->name(), *gemm_old_mm);
// remove last return instruction
if(std::prev(mm->end())->name() == "@return")
{
mm->remove_instruction(std::prev(mm->end()));
}
mm->set_bypass();
std::unordered_map<instruction_ref, instruction_ref> param_map;
param_map[gemm_ins] = std::prev(mm->end());
bool gemm_has_multi_outs = gemm_ins->outputs().size() > 1;
auto return_vals =
mm->fuse(*rm,
reduce_ins->inputs(),
&param_map,
[&](module& main_mod,
instruction_ref pos,
const operation& op,
const std::vector<instruction_ref>& inputs,
const std::vector<module_ref>& mod_args) {
if(op.name() == "pointwise")
{
auto* sub_pm = mod_args.front();
auto param_map_2 = create_param_map_with_literals(
&main_mod, sub_pm, op.compute_shape(to_shapes(inputs), mod_args));
return main_mod.insert_inline(pos, *sub_pm, inputs, &param_map_2)
.front(); // cppcheck-suppress returnDanglingLifetime;
}
return main_mod.insert_instruction(pos, op, inputs, mod_args);
});
if(gemm_has_multi_outs)
{
return_vals.insert(return_vals.end(), param_map[gemm_ins]);
}
mm->add_return(return_vals);
std::vector<instruction_ref> inputs;
std::copy_if(reduce_ins->inputs().begin(),
reduce_ins->inputs().end(),
std::back_inserter(inputs),
[&](auto input) { return input != gemm_ins; });
inputs.insert(inputs.end(), gemm_ins->inputs().begin(), gemm_ins->inputs().end());
if(gemm_has_multi_outs)
{
auto fused_ins = mpm.get_module().insert_instruction(
reduce_ins, mlir_op{gemm_ins->get_operator()}, mlir_contiguous(mpm, inputs), {mm});
auto dot_ins = mpm.get_module().insert_instruction(
reduce_ins,
migraphx::make_op("get_tuple_elem", {{"index", return_vals.size() - 1}}),
fused_ins);

mpm.get_module().replace_instruction(gemm_ins, dot_ins);
for(const auto outs : reduce_ins->outputs())
{
assert(outs->get_operator().name() == "get_tuple_elem");
mpm.get_module().replace_instruction(outs, outs->get_operator(), fused_ins);
}
}
else
{
mpm.get_module().replace_instruction(
reduce_ins, mlir_op{gemm_ins->get_operator()}, mlir_contiguous(mpm, inputs), {mm});
}
}
};

struct find_mlir_fused_ops
{
mlir_mode conv_mode = mlir_mode::none;
@@ -450,7 +581,7 @@ struct find_mlir_fused_ops
return i != x_ins and reaches(gemm_based_op, i);
}))
return;
auto names = pm->get_parameter_names();
auto names = pm->get_parameter_names();
std::sort(names.begin(), names.end());
module_ref mm = mpm.create_module("mlir_" + pm->name());
mm->set_bypass();
@@ -608,7 +739,7 @@ struct find_mlir_standalone_attention_op
auto gemm1 = mm->add_instruction(make_op("dot"), {softmax, new_upper_v});

std::vector<instruction_ref> ins_to_replace = {gemm1};
auto ins_to_be_replaced = gemm_softmax_gemm;
auto ins_to_be_replaced = gemm_softmax_gemm;
if(r.instructions.find("trailing_pm") != r.instructions.end())
{
auto trailing_pm_ins = r.instructions["trailing_pm"];
@@ -714,15 +845,25 @@ void fuse_mlir::apply(module_pass_manager& mpm) const
mpm,
find_mlir_fused_ops{.conv_mode = get_mode("fused_convolution", mlir_mode::fast),
.dot_mode = get_mode("fused_dot", mlir_mode::fast)});

match::find_matches(
mpm,
find_mlir_standalone_convolution_op{get_mode("convolution", mlir_mode::fast)},
find_mlir_standalone_dot_op{get_mode("dot", mlir_mode::fast)});

mpm.run_pass(dead_code_elimination{});
if(enabled(MIGRAPHX_ENABLE_MLIR_REDUCE_FUSION{}))
{
match::find_matches(
mpm,
find_mlir_split_reduce{.conv_mode = get_mode("fused_convolution", mlir_mode::fast),
.dot_mode = get_mode("fused_dot", mlir_mode::fast)});
}

if(enabled(MIGRAPHX_ENABLE_MLIR_INPUT_FUSION{}))
{
match::find_matches(mpm, find_pointwise_mlir{});
}
#else
(void)mpm;
#endif
6 changes: 4 additions & 2 deletions src/targets/gpu/include/migraphx/gpu/mlir.hpp
@@ -37,8 +37,8 @@ inline namespace MIGRAPHX_INLINE_NS {
struct module;
namespace gpu {

MIGRAPHX_GPU_EXPORT std::string dump_mlir(const module& m);
MIGRAPHX_GPU_EXPORT std::string dump_mlir(const module& m, const std::vector<shape>& inputs);
MIGRAPHX_GPU_EXPORT std::string dump_mlir(module m);
MIGRAPHX_GPU_EXPORT std::string dump_mlir(module m, const std::vector<shape>& inputs);

MIGRAPHX_GPU_EXPORT bool
is_module_fusible(const module& m, const context& migraphx_ctx, const value& solution);
@@ -50,6 +50,8 @@ struct MIGRAPHX_GPU_EXPORT mlir_code_object
std::vector<value> prefill_values = {};
};

MIGRAPHX_GPU_EXPORT bool is_reduce(const instruction& ins);
Collaborator comment: I don't see this used outside of mlir.cpp. I think it can be removed from the header.

Author reply (umangyadav, Aug 19, 2024): It is used in both fuse_mlir.cpp and mlir.cpp.


MIGRAPHX_GPU_EXPORT mlir_code_object compile_mlir(const context& migraphx_ctx,
module m,
const std::vector<shape>& in_shapes,
9 changes: 8 additions & 1 deletion src/targets/gpu/jit/mlir.cpp
@@ -99,7 +99,14 @@ struct mlir_compiler : compiler<mlir_compiler>
dot_mlir_inputs.push_back(mod_splits[0].mod.get_output_shapes().front());
mlir_code_object cop1 = compile_mlir(ctx, mod_splits[0].mod, dot_mlir_inputs, solution);
auto pw_shapes = to_shapes(mod_splits[1].inputs);
pw_shapes.push_back(mod_splits[1].mod.get_output_shapes().front());
if(mod_splits[1].mod.get_output_shapes().size() == 1)
{
pw_shapes.push_back(mod_splits[1].mod.get_output_shapes().front());
}
else
{
pw_shapes.push_back(shape{mod_splits[1].mod.get_output_shapes()});
}
assert(pw_shapes.back() == ins->get_shape());
auto pw_mod = create_pointwise_module(&mod_splits[1].mod);
auto cop2 = compile_pointwise(ctx, pw_shapes, &pw_mod);
2 changes: 1 addition & 1 deletion src/targets/gpu/jit/pad.cpp
@@ -1,7 +1,7 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal