From 0c36373fea57f8bc79776f3f586b7b022eaad478 Mon Sep 17 00:00:00 2001
From: Steve Yoo
Date: Thu, 12 Sep 2024 22:13:42 +0000
Subject: [PATCH] [GPU] Fix to enable fc 5d

---
Notes (ignored by git-am):
  * This patch was reconstructed from a copy whose line breaks and
    indentation had been collapsed. Hunk line counts and the diffstat
    below were re-verified, but exact indentation and intra-line spacing
    are a best-effort reconstruction -- apply with
    `git apply --ignore-whitespace` if context fails to match.
  * Template arguments that had been stripped from the copy were restored
    as `static_cast<size_t>` and `std::vector<float>`; confirm against the
    upstream tree.
  * The author's e-mail address was not present in the copy.

 .../intel_gpu/src/graph/fully_connected.cpp   |  62 ++++++++---
 .../test_cases/fully_connected_gpu_test.cpp   | 105 ++++++++++++++++--
 2 files changed, 139 insertions(+), 28 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/fully_connected.cpp b/src/plugins/intel_gpu/src/graph/fully_connected.cpp
index 7d0fe9c06096cd..e04541683a0c93 100644
--- a/src/plugins/intel_gpu/src/graph/fully_connected.cpp
+++ b/src/plugins/intel_gpu/src/graph/fully_connected.cpp
@@ -64,18 +64,18 @@ format::type get_preferred_format(fully_connected_node const& node, const kernel
     }
 
     if (input_layout.data_type == data_types::f32 &&
-        input_layout.format == format::bfyx &&
+        (input_layout.format == format::bfyx || input_layout.format == format::bfzyx) &&
         no_spatial_padding &&
         input_layout.batch() != 8)
-        return format::bfyx;
+        return input_layout.format;
 
     auto input_pitches = input_layout.get_pitches();
     if (input_layout.data_type == data_types::f16 &&
-        input_layout.format == format::bfyx &&
+        (input_layout.format == format::bfyx || input_layout.format == format::bfzyx) &&
         no_spatial_padding &&
         input_pitches[0] % 2 == 0 &&
         input_layout.batch() != 16)
-        return format::bfyx;
+        return input_layout.format;
 
     // this condition tests whether our input is batch>1 in bfyx format, if yes there will be
     // extra reorder between input and this fc from bfyx to yxfb format (so
@@ -115,23 +115,49 @@ layout fully_connected_inst::calc_output_layout(fully_connected_node const& node
     };
 
     int64_t feature = input_pshape[std::min(desc->input_size, static_cast<size_t>(4)) - 1].get_length();
-    if (desc->input_size == 3) {
-        feature = std::max({input_layout.spatial(0), input_layout.spatial(1), input_layout.spatial(2)});
-    }
+    auto output_size = tensor();
 
-    if ((supports_immad && desc->input_size > 3) || desc->input_size > 4) {
-        input_layout.set_partial_shape(reshape_to_2d(input_pshape, feature));
-    }
-    if (weights_pshape.size() != 2) {
-        weights_layout.set_partial_shape(reshape_to_2d(weights_pshape, feature));
-    }
+    // If immad is supported, spatial dimensions are reshaped to 2d in order to select oneDnn impl,
+    // because oneDnn doesn't support spatial dimensions for output.
+    if (supports_immad) {
+        if (desc->input_size == 3) {
+            feature = std::max({input_layout.spatial(0), input_layout.spatial(1), input_layout.spatial(2)});
+        }
 
-    auto output_size = tensor(input_layout.batch(), weights_layout.batch(), 1, 1);
-    if (desc->input_size == 3) {
-        output_size = tensor(input_layout.batch(), input_layout.feature(), 1, weights_layout.batch());
-    } else if (!supports_immad && desc->input_size == 4) {
-        output_size = tensor(input_layout.batch(), input_layout.feature(), weights_layout.batch(), input_layout.spatial(1));
+        if (desc->input_size > 3) {
+            input_layout.set_partial_shape(reshape_to_2d(input_pshape, feature));
+        }
+        if (weights_pshape.size() != 2) {
+            weights_layout.set_partial_shape(reshape_to_2d(weights_pshape, feature));
+        }
+
+        output_size = tensor(input_layout.batch(), weights_layout.batch(), 1, 1);
+        if (desc->input_size == 3) {
+            output_size = tensor(input_layout.batch(), input_layout.feature(), 1, weights_layout.batch());
+        }
+    } else {
+        feature = input_pshape[std::min(desc->input_size, static_cast<size_t>(5)) - 1].get_length();
+        if (desc->input_size == 3) {
+            feature = std::max({input_layout.spatial(0), input_layout.spatial(1), input_layout.spatial(2)});
+        }
+
+        if (desc->input_size > 5) {
+            input_layout.set_partial_shape(reshape_to_2d(input_pshape, feature));
+        }
+        if (weights_pshape.size() != 2) {
+            weights_layout.set_partial_shape(reshape_to_2d(weights_pshape, feature));
+        }
+
+        output_size = tensor(input_layout.batch(), weights_layout.batch(), 1, 1);
+        if (desc->input_size == 3) {
+            output_size = tensor(input_layout.batch(), input_layout.feature(), 1, weights_layout.batch());
+        } else if (desc->input_size == 4) {
+            output_size = tensor(input_layout.batch(), input_layout.feature(), weights_layout.batch(), input_layout.spatial(1));
+        } else if (desc->input_size == 5) {
+            output_size = tensor(input_layout.batch(), input_layout.feature(), weights_layout.batch(), input_layout.spatial(1), input_layout.spatial(2));
+        }
     }
+
     format output_format = get_preferred_format(node, impl_param);
 
     return layout(output_type, output_format, output_size);
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
index 8483cbdfbf36e2..5ed5af982d7861 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
@@ -376,12 +376,12 @@ TEST(fully_connected_gpu, no_biases_4d_input) {
     if (engine.get_device_info().supports_immad)
         return;
 
-    // Input : 1x256x256x384
-    // Output : 1x256x256x1536
-    // Weights: 1536x384x1x1
+    // Input : 1x8x8x12
+    // Weights: 48x12x1x1
+    // Output : 1x8x8x48
 
-    const int32_t input_b = 1, input_f = 256, input_y = 256, input_x = 384, // size of the whole input buffer
-                  weight_b = 1536, weight_f = 384, weight_y = 1, weight_x = 1; // size of the whole weights buffer
+    const int32_t input_b = 1, input_f = 8, input_y = 8, input_x = 12, // size of the whole input buffer
+                  weight_b = 48, weight_f = 12, weight_y = 1, weight_x = 1; // size of the whole weights buffer
 
     auto input_prim = engine.allocate_memory({ data_types::f32, format::bfyx, { input_b, input_f, input_x, input_y } });
     auto weights_prim = engine.allocate_memory({ data_types::f32, format::bfyx, { weight_b, weight_f, weight_x, weight_y } });
@@ -415,12 +415,12 @@ TEST(fully_connected_gpu, no_biases_4d_input_immad) {
     if (!engine.get_device_info().supports_immad)
         return;
 
-    // Input : 1x256x256x384
-    // Output : 65536x1536x1x1
-    // Weights: 1536x384x1x1
+    // Input : 1x8x8x12
+    // Weights: 48x12x1x1
+    // Output : 64x48x1x1
 
-    const int32_t input_b = 1, input_f = 256, input_y = 256, input_x = 384, // size of the whole input buffer
-                  weight_b = 1536, weight_f = 384, weight_y = 1, weight_x = 1; // size of the whole weights buffer
+    const int32_t input_b = 1, input_f = 8, input_y = 8, input_x = 12, // size of the whole input buffer
+                  weight_b = 48, weight_f = 12, weight_y = 1, weight_x = 1; // size of the whole weights buffer
 
     auto input_prim = engine.allocate_memory({ data_types::f32, format::bfyx, { input_b, input_f, input_x, input_y } });
     auto weights_prim = engine.allocate_memory({ data_types::f32, format::bfyx, { weight_b, weight_f, weight_x, weight_y } });
@@ -455,6 +455,91 @@ TEST(fully_connected_gpu, no_biases_4d_input_immad) {
     ASSERT_EQ(outputs.begin()->second.get_layout().spatial(0), weight_x);
 }
 
+TEST(fully_connected_gpu, no_biases_5d_input) {
+    auto& engine = get_test_engine();
+    if (engine.get_device_info().supports_immad)
+        return;
+
+    // Input : 1x8x8x8x12
+    // Weights: 48x12x1x1
+    // Output : 1x8x8x8x48
+
+    const int32_t input_b = 1, input_f = 8, input_z = 8, input_y = 8, input_x = 12, // size of the whole input buffer
+                  weight_b = 48, weight_f = 12, weight_y = 1, weight_x = 1; // size of the whole weights buffer
+
+    auto input_prim = engine.allocate_memory({ data_types::f32, format::bfzyx, { input_b, input_f, input_x, input_y, input_z } });
+    auto weights_prim = engine.allocate_memory({ data_types::f32, format::bfyx, { weight_b, weight_f, weight_x, weight_y } });
+
+    std::vector<float> input_data(input_b * input_f * input_z * input_y * input_x, 0);
+    std::vector<float> weights_data(weight_b * weight_f * weight_y * weight_x, 0);
+
+    set_values(input_prim, std::move(input_data));
+    set_values(weights_prim, std::move(weights_data));
+
+    auto input = input_layout("input", input_prim->get_layout());
+    auto w_data = data("weights", weights_prim);
+    auto fc = fully_connected("fc_prim", input_info("input"), "weights", "", 5, 2);
+    topology topology;
+    topology.add(input);
+    topology.add(w_data);
+    topology.add(fc);
+
+    network network(engine, topology, get_test_default_config(engine));
+    network.set_input_data("input", input_prim);
+
+    auto outputs = network.execute();
+    ASSERT_EQ(outputs.begin()->second.get_layout().batch(), input_b);
+    ASSERT_EQ(outputs.begin()->second.get_layout().feature(), input_f);
+    ASSERT_EQ(outputs.begin()->second.get_layout().spatial(2), input_z);
+    ASSERT_EQ(outputs.begin()->second.get_layout().spatial(1), input_y);
+    ASSERT_EQ(outputs.begin()->second.get_layout().spatial(0), weight_b);
+}
+
+TEST(fully_connected_gpu, no_biases_5d_input_immad) {
+    auto& engine = get_test_engine();
+    if (!engine.get_device_info().supports_immad)
+        return;
+
+    // Input : 1x8x8x8x12
+    // Weights: 48x12x1x1
+    // Output : 512x48x1x1
+
+    const int32_t input_b = 1, input_f = 8, input_z = 8, input_y = 8, input_x = 12, // size of the whole input buffer
+                  weight_b = 48, weight_f = 12, weight_y = 1, weight_x = 1; // size of the whole weights buffer
+
+    auto input_prim = engine.allocate_memory({ data_types::f32, format::bfzyx, { input_b, input_f, input_x, input_y, input_z } });
+    auto weights_prim = engine.allocate_memory({ data_types::f32, format::bfyx, { weight_b, weight_f, weight_x, weight_y } });
+
+    std::vector<float> input_data(input_b * input_f * input_z * input_y * input_x, 0);
+    std::vector<float> weights_data(weight_b * weight_f * weight_y * weight_x, 0);
+
+    set_values(input_prim, std::move(input_data));
+    set_values(weights_prim, std::move(weights_data));
+
+    auto input = input_layout("input", input_prim->get_layout());
+    auto w_data = data("weights", weights_prim);
+    auto fc = fully_connected("fc_prim", input_info("input"), "weights", "", 5, 2);
+    topology topology;
+    topology.add(input);
+    topology.add(w_data);
+    topology.add(fc);
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    network network(engine, topology, config);
+    network.set_input_data("input", input_prim);
+
+    auto fc_impl = network.get_primitive("fc_prim")->get_impl();
+    ASSERT_TRUE(fc_impl != nullptr);
+    ASSERT_TRUE(fc_impl->is_onednn());
+
+    auto outputs = network.execute();
+    ASSERT_EQ(outputs.begin()->second.get_layout().batch(), input_f*input_z*input_y);
+    ASSERT_EQ(outputs.begin()->second.get_layout().feature(), weight_b);
+    ASSERT_EQ(outputs.begin()->second.get_layout().spatial(1), weight_y);
+    ASSERT_EQ(outputs.begin()->second.get_layout().spatial(0), weight_x);
+}
+
 TEST(fully_connected_gpu, xb_f32_batch_1) {
     // Input : 3x1
     // Output : 4x1