From 0c36373fea57f8bc79776f3f586b7b022eaad478 Mon Sep 17 00:00:00 2001
From: Steve Yoo <steve.yoo@intel.com>
Date: Thu, 12 Sep 2024 22:13:42 +0000
Subject: [PATCH] [GPU] Fix to enable fc 5d

---
 .../intel_gpu/src/graph/fully_connected.cpp   |  62 ++++++++---
 .../test_cases/fully_connected_gpu_test.cpp   | 105 ++++++++++++++++--
 2 files changed, 139 insertions(+), 28 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/fully_connected.cpp b/src/plugins/intel_gpu/src/graph/fully_connected.cpp
index 7d0fe9c06096cd..e04541683a0c93 100644
--- a/src/plugins/intel_gpu/src/graph/fully_connected.cpp
+++ b/src/plugins/intel_gpu/src/graph/fully_connected.cpp
@@ -64,18 +64,18 @@ format::type get_preferred_format(fully_connected_node const& node, const kernel
     }
 
     if (input_layout.data_type == data_types::f32 &&
-        input_layout.format == format::bfyx &&
+        (input_layout.format == format::bfyx || input_layout.format == format::bfzyx) &&
         no_spatial_padding &&
         input_layout.batch() != 8)
-        return format::bfyx;
+        return input_layout.format;
 
     auto input_pitches = input_layout.get_pitches();
     if (input_layout.data_type == data_types::f16 &&
-        input_layout.format == format::bfyx &&
+        (input_layout.format == format::bfyx || input_layout.format == format::bfzyx) &&
         no_spatial_padding &&
         input_pitches[0] % 2 == 0 &&
         input_layout.batch() != 16)
-        return format::bfyx;
+        return input_layout.format;
 
     // this condition tests whether our input is batch>1 in bfyx format, if yes there will be
     // extra reorder between input and this fc from bfyx to yxfb format (so
@@ -115,23 +115,49 @@ layout fully_connected_inst::calc_output_layout(fully_connected_node const& node
     };
 
     int64_t feature = input_pshape[std::min(desc->input_size, static_cast<size_t>(4)) - 1].get_length();
-    if (desc->input_size == 3) {
-        feature = std::max({input_layout.spatial(0), input_layout.spatial(1), input_layout.spatial(2)});
-    }
+    auto output_size = tensor();
 
-    if ((supports_immad && desc->input_size > 3) || desc->input_size > 4) {
-       input_layout.set_partial_shape(reshape_to_2d(input_pshape, feature));
-    }
-    if (weights_pshape.size() != 2) {
-        weights_layout.set_partial_shape(reshape_to_2d(weights_pshape, feature));
-    }
+    // When immad is supported, inputs with more than 3 dimensions are flattened to 2D so that the oneDNN
+    // implementation can be selected, because oneDNN does not support spatial dimensions in the output.
+    if (supports_immad) {
+        if (desc->input_size == 3) {
+            feature = std::max({input_layout.spatial(0), input_layout.spatial(1), input_layout.spatial(2)});
+        }
 
-    auto output_size = tensor(input_layout.batch(), weights_layout.batch(), 1, 1);
-    if (desc->input_size == 3) {
-        output_size = tensor(input_layout.batch(), input_layout.feature(), 1, weights_layout.batch());
-    } else if (!supports_immad && desc->input_size == 4) {
-        output_size = tensor(input_layout.batch(), input_layout.feature(), weights_layout.batch(), input_layout.spatial(1));
+        if (desc->input_size > 3) {
+            input_layout.set_partial_shape(reshape_to_2d(input_pshape, feature));
+        }
+        if (weights_pshape.size() != 2) {
+            weights_layout.set_partial_shape(reshape_to_2d(weights_pshape, feature));
+        }
+
+        output_size = tensor(input_layout.batch(), weights_layout.batch(), 1, 1);
+        if (desc->input_size == 3) {
+            output_size = tensor(input_layout.batch(), input_layout.feature(), 1, weights_layout.batch());
+        }
+    } else {
+        feature = input_pshape[std::min(desc->input_size, static_cast<size_t>(5)) - 1].get_length();
+        if (desc->input_size == 3) {
+            feature = std::max({input_layout.spatial(0), input_layout.spatial(1), input_layout.spatial(2)});
+        }
+
+        if (desc->input_size > 5) {
+            input_layout.set_partial_shape(reshape_to_2d(input_pshape, feature));
+        }
+        if (weights_pshape.size() != 2) {
+            weights_layout.set_partial_shape(reshape_to_2d(weights_pshape, feature));
+        }
+
+        output_size = tensor(input_layout.batch(), weights_layout.batch(), 1, 1);
+        if (desc->input_size == 3) {
+            output_size = tensor(input_layout.batch(), input_layout.feature(), 1, weights_layout.batch());
+        } else if (desc->input_size == 4) {
+            output_size = tensor(input_layout.batch(), input_layout.feature(), weights_layout.batch(), input_layout.spatial(1));
+        } else if (desc->input_size == 5) {
+            output_size = tensor(input_layout.batch(), input_layout.feature(), weights_layout.batch(), input_layout.spatial(1), input_layout.spatial(2));
+        }
     }
+
     format output_format = get_preferred_format(node, impl_param);
 
     return layout(output_type, output_format, output_size);
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
index 8483cbdfbf36e2..5ed5af982d7861 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
@@ -376,12 +376,12 @@ TEST(fully_connected_gpu, no_biases_4d_input) {
     if (engine.get_device_info().supports_immad)
         return;
 
-    //  Input  : 1x256x256x384
-    //  Output : 1x256x256x1536
-    //  Weights: 1536x384x1x1
+    //  Input  : 1x8x8x12
+    //  Weights: 48x12x1x1
+    //  Output : 1x8x8x48
 
-    const int32_t input_b = 1, input_f = 256, input_y = 256, input_x = 384,     // size of the whole input buffer
-                  weight_b = 1536, weight_f = 384, weight_y = 1, weight_x = 1;  // size of the whole weights buffer
+    const int32_t input_b = 1, input_f = 8, input_y = 8, input_x = 12,          // size of the whole input buffer
+                  weight_b = 48, weight_f = 12, weight_y = 1, weight_x = 1;     // size of the whole weights buffer
 
     auto input_prim = engine.allocate_memory({ data_types::f32, format::bfyx, { input_b, input_f, input_x, input_y } });
     auto weights_prim = engine.allocate_memory({ data_types::f32, format::bfyx, { weight_b, weight_f, weight_x, weight_y } });
@@ -415,12 +415,12 @@ TEST(fully_connected_gpu, no_biases_4d_input_immad) {
     if (!engine.get_device_info().supports_immad)
         return;
 
-    //  Input  : 1x256x256x384
-    //  Output : 65536x1536x1x1
-    //  Weights: 1536x384x1x1
+    //  Input  : 1x8x8x12
+    //  Weights: 48x12x1x1
+    //  Output : 64x48x1x1
 
-    const int32_t input_b = 1, input_f = 256, input_y = 256, input_x = 384,     // size of the whole input buffer
-                  weight_b = 1536, weight_f = 384, weight_y = 1, weight_x = 1;  // size of the whole weights buffer
+    const int32_t input_b = 1, input_f = 8, input_y = 8, input_x = 12,          // size of the whole input buffer
+                  weight_b = 48, weight_f = 12, weight_y = 1, weight_x = 1;     // size of the whole weights buffer
 
     auto input_prim = engine.allocate_memory({ data_types::f32, format::bfyx, { input_b, input_f, input_x, input_y } });
     auto weights_prim = engine.allocate_memory({ data_types::f32, format::bfyx, { weight_b, weight_f, weight_x, weight_y } });
@@ -455,6 +455,91 @@ TEST(fully_connected_gpu, no_biases_4d_input_immad) {
     ASSERT_EQ(outputs.begin()->second.get_layout().spatial(0), weight_x);
 }
 
+TEST(fully_connected_gpu, no_biases_5d_input) {
+    auto& engine = get_test_engine();
+    if (engine.get_device_info().supports_immad)
+        return;  // non-immad path only; immad devices are covered by no_biases_5d_input_immad
+
+    //  Input  : 1x8x8x8x12 (b, f, z, y, x; allocated as bfzyx, zero-filled)
+    //  Weights: 48x12x1x1  (b, f, y, x; allocated as bfyx, zero-filled, no bias)
+    //  Output : 1x8x8x8x48 (shape-only check: input b/f/z/y are kept, x becomes weight_b)
+
+    const int32_t input_b = 1, input_f = 8, input_z = 8, input_y = 8, input_x = 12, // size of the whole input buffer
+                  weight_b = 48, weight_f = 12, weight_y = 1, weight_x = 1;         // size of the whole weights buffer
+
+    auto input_prim = engine.allocate_memory({ data_types::f32, format::bfzyx, { input_b, input_f, input_x, input_y, input_z } });
+    auto weights_prim = engine.allocate_memory({ data_types::f32, format::bfyx, { weight_b, weight_f, weight_x, weight_y } });
+
+    std::vector<float> input_data(input_b * input_f * input_z * input_y * input_x, 0);
+    std::vector<float> weights_data(weight_b * weight_f * weight_y * weight_x, 0);
+
+    set_values(input_prim, std::move(input_data));
+    set_values(weights_prim, std::move(weights_data));
+
+    auto input = input_layout("input", input_prim->get_layout());
+    auto w_data = data("weights", weights_prim);
+    auto fc = fully_connected("fc_prim", input_info("input"), "weights", "", 5, 2);
+    topology topology;
+    topology.add(input);
+    topology.add(w_data);
+    topology.add(fc);
+
+    network network(engine, topology, get_test_default_config(engine));
+    network.set_input_data("input", input_prim);
+
+    auto outputs = network.execute();
+    ASSERT_EQ(outputs.begin()->second.get_layout().batch(), input_b);
+    ASSERT_EQ(outputs.begin()->second.get_layout().feature(), input_f);
+    ASSERT_EQ(outputs.begin()->second.get_layout().spatial(2), input_z);
+    ASSERT_EQ(outputs.begin()->second.get_layout().spatial(1), input_y);
+    ASSERT_EQ(outputs.begin()->second.get_layout().spatial(0), weight_b);
+}
+
+TEST(fully_connected_gpu, no_biases_5d_input_immad) {
+    auto& engine = get_test_engine();
+    if (!engine.get_device_info().supports_immad)
+        return;  // immad path only; other devices are covered by no_biases_5d_input
+
+    //  Input  : 1x8x8x8x12 (b, f, z, y, x; allocated as bfzyx, zero-filled)
+    //  Weights: 48x12x1x1  (b, f, y, x; allocated as bfyx, zero-filled, no bias)
+    //  Output : 512x48x1x1 (shape-only check: batch = input b*f*z*y, feature = weight_b; oneDNN impl expected)
+
+    const int32_t input_b = 1, input_f = 8, input_z = 8, input_y = 8, input_x = 12, // size of the whole input buffer
+                  weight_b = 48, weight_f = 12, weight_y = 1, weight_x = 1;         // size of the whole weights buffer
+
+    auto input_prim = engine.allocate_memory({ data_types::f32, format::bfzyx, { input_b, input_f, input_x, input_y, input_z } });
+    auto weights_prim = engine.allocate_memory({ data_types::f32, format::bfyx, { weight_b, weight_f, weight_x, weight_y } });
+
+    std::vector<float> input_data(input_b * input_f * input_z * input_y * input_x, 0);
+    std::vector<float> weights_data(weight_b * weight_f * weight_y * weight_x, 0);
+
+    set_values(input_prim, std::move(input_data));
+    set_values(weights_prim, std::move(weights_data));
+
+    auto input = input_layout("input", input_prim->get_layout());
+    auto w_data = data("weights", weights_prim);
+    auto fc = fully_connected("fc_prim", input_info("input"), "weights", "", 5, 2);
+    topology topology;
+    topology.add(input);
+    topology.add(w_data);
+    topology.add(fc);
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    network network(engine, topology, config);
+    network.set_input_data("input", input_prim);
+
+    auto fc_impl = network.get_primitive("fc_prim")->get_impl();
+    ASSERT_TRUE(fc_impl != nullptr);
+    ASSERT_TRUE(fc_impl->is_onednn());
+
+    auto outputs = network.execute();
+    ASSERT_EQ(outputs.begin()->second.get_layout().batch(), input_b * input_f * input_z * input_y);
+    ASSERT_EQ(outputs.begin()->second.get_layout().feature(), weight_b);
+    ASSERT_EQ(outputs.begin()->second.get_layout().spatial(1), weight_y);
+    ASSERT_EQ(outputs.begin()->second.get_layout().spatial(0), weight_x);
+}
+
 TEST(fully_connected_gpu, xb_f32_batch_1) {
     //  Input  : 3x1
     //  Output : 4x1