From bd44caacf15a3b6b059af77e3345f79606067fea Mon Sep 17 00:00:00 2001 From: Giorgio Arena Date: Tue, 15 Mar 2022 13:45:15 +0000 Subject: [Dynamic Fusion] Implement build options generation Resolves: COMPMID-5153 Signed-off-by: Giorgio Arena Change-Id: Ic34cc1f0d092fafa7c2faa4dd705cf8f68eaf87e Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7317 Comments-Addressed: Arm Jenkins Reviewed-by: SiCong Li Tested-by: Arm Jenkins --- .../dynamic_fusion/ClKernelBuildingImpl/Common.h | 66 ++++-- .../components/ClElementwiseAddKernelComponent.cpp | 16 ++ .../components/ClElementwiseAddKernelComponent.h | 1 + .../components/ClGemmNativeKernelComponent.cpp | 225 +++++++++++++++------ .../components/ClGemmNativeKernelComponent.h | 1 + .../components/ClStoreKernelComponents.cpp | 17 ++ .../components/ClStoreKernelComponents.h | 5 +- .../CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp | 70 +------ 8 files changed, 248 insertions(+), 153 deletions(-) diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h index b285cc2b54..6e1291cdd5 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h @@ -30,6 +30,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/GPUTarget.h" #include "src/core/common/Macros.h" +#include "support/StringSupport.h" #include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h" @@ -191,7 +192,7 @@ public: struct TagVal { TagVal() = default; - TagVal(SharedVarTable::SharedVar var) + TagVal(const SharedVarTable::SharedVar &var) : value{ var.uniq_name } { } @@ -201,6 +202,11 @@ public: { } + TagVal(const std::string &val) + : value{ val } + { + } + std::string value{}; }; using TagLUT = std::unordered_map; // Used to instantiating a code template / replacing tags @@ -217,12 +223,12 @@ public: virtual std::vector get_links() const = 0; virtual std::string name() const = 0; + // @note: some tags can be unused since they could be used only for the macros, or only for the component code static std::string replace_tags(const std::string &code_template, const TagLUT &tags) { - std::string replaced_code = ""; - std::unordered_set used_tags{}; - bool scanning_pattern = false; - std::string pattern_found = ""; + std::string replaced_code = ""; + bool scanning_pattern = false; + std::string pattern_found = ""; for(size_t i = 0; i < code_template.size() - 1; ++i) { if(!scanning_pattern) @@ -247,7 +253,6 @@ public: std::string err = "Pattern " + pattern_found + " not found in tags"; ARM_COMPUTE_ERROR_ON_MSG(tags.find(pattern_found) == tags.end(), err.c_str()); replaced_code += tags.find(pattern_found)->second.value; - used_tags.insert(pattern_found); } else { @@ -255,12 +260,7 @@ public: } } } - // Check for unused tags - for(const auto &tag : tags) - { - ARM_COMPUTE_UNUSED(tag); - ARM_COMPUTE_ERROR_ON_MSG(used_tags.find(tag.first) == used_tags.end(), "Warning: unused tags"); - } + return replaced_code; } ComponentID id() const @@ -303,6 +303,11 @@ public: return ""; } + virtual CLBuildOptions generate_build_options() const + { + return CLBuildOptions{}; + } + protected: const ClKernelBlueprint *_blueprint; @@ -445,12 +450,10 @@ public: { std::string name = ""; - auto stack = topological_sort(); - while(!stack.empty()) + traverse([&](std::stack stack) { name += _components.find(stack.top())->second->name() + (stack.size() > 2 ? 
"___" : ""); - stack.pop(); - } + }); return name; } @@ -480,7 +483,7 @@ public: headers_list.insert(curr_headers_list.begin(), curr_headers_list.end()); if(!curr_additional_macros.empty()) // Some components might not have any { - additional_macros.insert(curr_additional_macros); + additional_macros.insert(IClKernelComponent::replace_tags(curr_additional_macros, var_lut)); } stack.pop(); @@ -524,7 +527,19 @@ public: CLBuildOptions build_options() const { - return CLBuildOptions{}; + CLBuildOptions build_opts{}; + + traverse([&](std::stack stack) + { + build_opts.add_options(_components.find(stack.top())->second->generate_build_options().options()); + }); + + return build_opts; + } + + TileDescriptor get_tile_info() const + { + return _tile_info; } Window get_execution_window() const @@ -596,6 +611,17 @@ private: return stack; } + void traverse(const std::function)> &func) const + { + std::stack stack = topological_sort(); + + while(!stack.empty()) + { + func(stack); + stack.pop(); + } + } + std::string generate_argument_declaration(const SharedVarTable::SharedVar &var) const { ARM_COMPUTE_ERROR_ON_MSG(var.group != SharedVarGroup::Argument, "An argument declaration can only be generated from a kernel argument"); @@ -672,7 +698,7 @@ private: ARM_COMPUTE_ERROR("Unsupported clipping strategy"); } - code += "\n REPEAT_VAR_INIT_TO_CONST(M0, uint, g_zout, 0);\n"; + code += "\n REPEAT_VAR_INIT_TO_CONST(" + std::to_string(tile_dim_y) + ", uint, g_zout, 0);\n"; code += " REPEAT_VAR_INIT_TO_CONST(16, uint, g_zero, 0);\n\n"; return code; @@ -684,7 +710,7 @@ private: int32_t _num_components{}; int32_t _num_complex_components{}; - ArgumentID _dst_id{ -1 }; + ArgumentID _dst_id{ -1 }; // Initially set to -1, which means the graph has no dst yet, since node IDs are positive numbers // Argument, components and intermediate tensors IDs with corresponding ptrs (except intermediate) std::unordered_map _components{}; diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp index 06c29c4253..bbdf8df0a3 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp @@ -82,6 +82,22 @@ std::string ClElementwiseAddKernelComponent::get_component_code() const )_"; } + +CLBuildOptions ClElementwiseAddKernelComponent::generate_build_options() const +{ + auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); + auto tile_info = _blueprint->impl().get_tile_info(); + + CLBuildOptions build_opts{}; + + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type())); + build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y())); + build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x())); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y())); + + return build_opts; +} + ClElementwiseAddKernelComponent::TagLUT ClElementwiseAddKernelComponent::allocate_vars(SharedVarTable &vtable) const { // Determine which argument is the accumulator diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h 
b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h index fe5f964c54..c259811a98 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h @@ -46,6 +46,7 @@ public: std::set get_headers_list() const override; std::string get_component_code() const override; Window get_window() const override; + CLBuildOptions generate_build_options() const override; virtual std::vector get_links() const override { diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp index e70e5d5ea5..4bf0b76c3a 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp @@ -28,6 +28,9 @@ #include "src/core/AccessWindowStatic.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/float_ops.h" +#include "support/StringSupport.h" + namespace arm_compute { namespace experimental @@ -214,6 +217,13 @@ std::string ClGemmNativeKernelComponent::get_additional_macros() const std::string ClGemmNativeKernelComponent::get_component_code() const { + auto t_lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id); + auto t_rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id); + + auto has_alpha = !(helpers::float_ops::is_one(_desc.alpha)); + auto reinterpret_input_as_3d = _desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d == 0; + auto dont_slide_b = t_rhs_info->num_dimensions() < t_lhs_info->num_dimensions(); + std::string code = R"_( //------------------ START KERNEL {{meta_kernel_id}} --------------------- // IN_0(lhs) {{lhs}} @@ -245,34 +255,49 @@ std::string ClGemmNativeKernelComponent::get_component_code() const // Compute RHS matrix address uint rhs_offset = {{rhs}}_offset_first_element_in_bytes + g_x * N0 * sizeof(DATA_TYPE); + )_"; -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - rhs_offset += (g_z % MATRIX_B_DEPTH) * {{rhs}}_stride_z; -#else // defined(MATRIX_B_DEPTH) - rhs_offset += g_z * {{rhs}}_stride_z; -#endif // defined(MATRIX_B_DEPTH) + if(dont_slide_b) + { + code += R"_( + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset += (g_z % {{MATRIX_B_DEPTH}}) * {{rhs}}_stride_z; + )_"; + } + else + { + code += R"_( + rhs_offset += g_z * {{rhs}}_stride_z; + )_"; + } + code += R"_( REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); + )_"; -#if defined(REINTERPRET_INPUT_AS_3D) - // The plane (zlhs) is calculated dividing M (g_y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, {{lhs}}_cross_plane_pad, {{lhs}}_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += g_z * {{lhs}}_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - lhs_offset += g_z * {{lhs}}_stride_z; + if(reinterpret_input_as_3d) + { + code += R"_( + // The plane (zlhs) is calculated dividing M (g_y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0), {{HEIGHT_GEMM3D}}, {{DEPTH_GEMM3D}}, {{lhs}}_cross_plane_pad, {{lhs}}_stride_y); -#endif // defined(REINTERPRET_INPUT_AS_3D) + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += g_z * {{lhs}}_stride_z * {{DEPTH_GEMM3D}}; + )_"; + } + else + { + code += R"_( + // Add offset for batched GEMM + lhs_offset += g_z * {{lhs}}_stride_z; + )_"; + } + code += R"_( int i = 0; -#if K0 > 1 - for(; i <= (K - K0); i += K0) +#if {{K0}} > 1 + for(; i <= (K - {{K0}}); i += {{K0}}) { // Supported cases (M0, K0): // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 @@ -284,26 +309,26 @@ std::string ClGemmNativeKernelComponent::get_component_code() const // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, {{lhs}}_ptr, lhs_offset, {{lhs}}_stride_y, zlhs); + LOAD_BLOCK(M0, {{K0}}, DATA_TYPE, a, {{lhs}}_ptr, lhs_offset, {{lhs}}_stride_y, zlhs); // Load values from RHS matrix - LOAD_BLOCK(K0, N0, DATA_TYPE, b, {{rhs}}_ptr, rhs_offset, {{rhs}}_stride_y, g_zero); + LOAD_BLOCK({{K0}}, N0, DATA_TYPE, b, {{rhs}}_ptr, rhs_offset, {{rhs}}_stride_y, g_zero); RHS_VFMA_M0xN0(0, a, b0, {{dst}}); RHS_VFMA_M0xN0(1, a, b1, {{dst}}); -#if K0 > 2 +#if {{K0}} > 2 RHS_VFMA_M0xN0(2, a, b2, {{dst}}); #endif // K0 > 2 -#if K0 > 3 +#if {{K0}} > 3 RHS_VFMA_M0xN0(3, a, b3, {{dst}}); #endif // K0 > 3 -#if K0 > 4 +#if {{K0}} > 4 RHS_VFMA_M0xN0(4, a, b4, {{dst}}); RHS_VFMA_M0xN0(5, a, b5, {{dst}}); RHS_VFMA_M0xN0(6, a, b6, {{dst}}); RHS_VFMA_M0xN0(7, a, b7, {{dst}}); #endif // K0 > 4 -#if K0 > 8 +#if {{K0}} > 8 RHS_VFMA_M0xN0(8, a, b8, {{dst}}); RHS_VFMA_M0xN0(9, a, b9, {{dst}}); RHS_VFMA_M0xN0(A, a, bA, {{dst}}); @@ -314,8 +339,8 @@ std::string ClGemmNativeKernelComponent::get_component_code() const RHS_VFMA_M0xN0(F, a, bF, {{dst}}); #endif // K0 > 8 - lhs_offset += K0 * sizeof(DATA_TYPE); - rhs_offset += K0 * {{rhs}}_stride_y; + lhs_offset += {{K0}} * sizeof(DATA_TYPE); + rhs_offset += {{K0}} * {{rhs}}_stride_y; } #endif // K0 > 1 // Left-over accumulations @@ -362,44 +387,61 @@ std::string ClGemmNativeKernelComponent::get_component_code() const } // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, {{dst}}, ALPHA); -#endif // defined(ALPHA) )_"; - - if(!_bias.is_empty()) + if(has_alpha) { code += R"_( - // Add beta*bias -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, {{dst}}, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (g_x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(g_y, M0, - 
PARTIAL_STORE_M0) - * {{bias}}_stride_y) - + g_z * {{bias}}_stride_z; - - LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(M0, {{dst}}, bias); + SCALE_BLOCK(M0, DATA_TYPE, {{dst}}, {{ALPHA}}); + )_"; + } -#endif // defined(BROADCAST_BIAS) - )_"; + if(!_bias.is_empty()) + { + if(_desc.broadcast_bias) + { + code += R"_( + // Add beta*bias + __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero); + )_"; + + if(helpers::float_ops::is_one(_desc.beta)) + { + code += R"_( + SCALE_BLOCK(1, DATA_TYPE, bias, {{BETA}}); + )_"; + } + + code += R"_( + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(M0, {{dst}}, bias0); + )_"; + } + else + { + code += R"_( + // Add beta*bias + __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (g_x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(g_y, M0, + PARTIAL_STORE_M0) + * {{bias}}_stride_y) + + g_z * {{bias}}_stride_z; + + LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero); + )_"; + + if(helpers::float_ops::is_one(_desc.beta)) + { + code += R"_( + SCALE_BLOCK(M0, DATA_TYPE, bias, {{BETA}}); + )_"; + } + + code += R"_( + // c = c + bias + ADD_BLOCK(M0, {{dst}}, bias); + )_"; + } } code += R"_( @@ -409,6 +451,25 @@ std::string ClGemmNativeKernelComponent::get_component_code() const return code.c_str(); } +CLBuildOptions ClGemmNativeKernelComponent::generate_build_options() const +{ + auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); + auto tile_info = _blueprint->impl().get_tile_info(); + + CLBuildOptions build_opts{}; + + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type())); + build_opts.add_option("-DM=" + support::cpp11::to_string(tile_info.boundaries.y())); + build_opts.add_option("-DN=" + support::cpp11::to_string(tile_info.boundaries.x())); + build_opts.add_option("-DK=" + support::cpp11::to_string(_desc.k)); + build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y())); + build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x())); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y())); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(tile_info.boundaries.x() % tile_info.tile_dims.x())); + + return build_opts; +} + ClGemmNativeKernelComponent::TagLUT ClGemmNativeKernelComponent::allocate_vars(SharedVarTable &vtable) const { TagLUT lut{}; @@ -421,6 +482,44 @@ ClGemmNativeKernelComponent::TagLUT ClGemmNativeKernelComponent::allocate_vars(S lut["bias"] = vtable.add(_bias, ClKernelArgRuntimeDescriptor(_bias.arg_id, TensorArgType::Image_3D), "bias"); } lut["dst"] = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Image_3D), "dst"); + + // Local build options + auto t_lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id); + auto t_rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id); + auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); + + auto has_alpha = !(helpers::float_ops::is_one(_desc.alpha)); + auto has_beta = _blueprint->impl().get_kernel_argument_info(_bias.arg_id) 
!= nullptr; + auto reinterpret_input_as_3d = _desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d == 0; + auto reinterpret_output_as_3d = !_desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d != 0; + auto dont_slide_b = t_rhs_info->num_dimensions() < t_lhs_info->num_dimensions(); + + lut["K0"] = support::cpp11::to_string(_desc.rhs_info.k0); + + if(has_alpha) + { + lut["ALPHA"] = float_to_string_with_full_precision(_desc.alpha); + } + if(has_beta) + { + lut["BETA"] = float_to_string_with_full_precision(_desc.beta); + } + if(dont_slide_b) + { + lut["MATRIX_B_DEPTH"] = support::cpp11::to_string(t_rhs_info->dimension(2)); + } + + if(reinterpret_output_as_3d) + { + lut["HEIGHT_GEMM3D"] = support::cpp11::to_string(t_dst_info->dimension(1)); + lut["DEPTH_GEMM3D"] = support::cpp11::to_string(t_dst_info->dimension(2)); + } + else if(reinterpret_input_as_3d) + { + lut["HEIGHT_GEMM3D"] = support::cpp11::to_string(t_lhs_info->dimension(1)); + lut["DEPTH_GEMM3D"] = support::cpp11::to_string(t_lhs_info->dimension(2)); + } + return lut; } } // namespace dynamic_fusion diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h index 09933a8932..1a1e3e3ce6 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h @@ -52,6 +52,7 @@ public: std::string get_component_code() const override; Window get_window() const override; ClKernelArgList get_args(); + CLBuildOptions generate_build_options() const override; virtual std::vector get_links() const override { diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp index 430fafb89f..2d7b46616f 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp @@ -61,6 +61,23 @@ std::string ClStoreBlockBoundaryAwareKernelComponent::get_component_code() const )_"; } + +CLBuildOptions ClStoreBlockBoundaryAwareKernelComponent::generate_build_options() const +{ + auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); + auto tile_info = _blueprint->impl().get_tile_info(); + + CLBuildOptions build_opts{}; + + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type())); + build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y())); + build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x())); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y())); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(tile_info.boundaries.x() % tile_info.tile_dims.x())); + + return build_opts; +} + ClStoreBlockBoundaryAwareKernelComponent::TagLUT ClStoreBlockBoundaryAwareKernelComponent::allocate_vars(SharedVarTable &vtable) const { return { diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h index 
ad7a207ef8..8d58da2a0d 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h @@ -41,8 +41,9 @@ public: : IClKernelComponent(blueprint), _src{ src }, _dst{ dst } { } - ComponentType get_component_type() const override; - std::string get_component_code() const override; + ComponentType get_component_type() const override; + std::string get_component_code() const override; + CLBuildOptions generate_build_options() const override; virtual std::vector get_links() const override { diff --git a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp index 753e0a4625..cb365901da 100644 --- a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp +++ b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp @@ -75,70 +75,6 @@ void fill(U &&tensor, int seed) DistributionType distribution_inf{ T(std::numeric_limits::infinity()), T(std::numeric_limits::infinity()) }; library->fill_borders_with_garbage(tensor, distribution_inf, seed); } - -void set_build_options(ClKernelCode &cl_code, GemmNativeDescriptor gemm_native_desc, - const TensorInfo &t_lhs_info, - const TensorInfo &t_rhs_info, - const TensorInfo *t_bias_info, - const TensorInfo &t_dst_info) -{ - CLBuildOptions ref_cl_build_options; - { - // If reinterpret_input_as_3d = reinterpret_output_as_3d = true, - // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. - // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m - auto reinterpret_input_as_3d = gemm_native_desc.reinterpret_input_as_3d; - auto reinterpret_output_as_3d = gemm_native_desc.depth_output_gemm3d != 0; - auto _slide_matrix_b = (t_rhs_info.num_dimensions() >= t_lhs_info.num_dimensions()); - auto _use_dummy_work_items = false; - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_output_as_3d) - { - reinterpret_input_as_3d = false; - reinterpret_output_as_3d = false; - } - - const unsigned int internal_m = reinterpret_output_as_3d ? gemm_native_desc.m : t_dst_info.dimension(1); - - const unsigned int h_gemm_3d = reinterpret_output_as_3d ? t_dst_info.dimension(1) : t_lhs_info.dimension(1); - const unsigned int d_gemm_3d = reinterpret_output_as_3d ? t_dst_info.dimension(2) : t_lhs_info.dimension(2); - - // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. - const unsigned int partial_store_m0 = internal_m % gemm_native_desc.lhs_info.m0; - const unsigned int partial_store_n0 = gemm_native_desc.n % gemm_native_desc.rhs_info.n0; - - // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. 
- const unsigned int internal_m0 = std::min(internal_m, gemm_native_desc.lhs_info.m0); - - ref_cl_build_options.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info.data_type())); - ref_cl_build_options.add_option_if(!(helpers::float_ops::is_one(gemm_native_desc.alpha)), "-DALPHA=" + float_to_string_with_full_precision(gemm_native_desc.alpha)); - ref_cl_build_options.add_option_if(t_bias_info != nullptr, "-DBETA=" + float_to_string_with_full_precision(gemm_native_desc.beta)); - ref_cl_build_options.add_option_if(helpers::float_ops::is_one(gemm_native_desc.beta), "-DUNIT_BETA"); - ref_cl_build_options.add_option_if(gemm_native_desc.broadcast_bias, "-DBROADCAST_BIAS"); - ref_cl_build_options.add_option_if(reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); - ref_cl_build_options.add_option_if(reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - ref_cl_build_options.add_option_if(reinterpret_input_as_3d || reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); - ref_cl_build_options.add_option_if(reinterpret_input_as_3d || reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); - ref_cl_build_options.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(t_rhs_info.dimension(2))); - ref_cl_build_options.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - ref_cl_build_options.add_option("-DM=" + support::cpp11::to_string(internal_m)); - ref_cl_build_options.add_option("-DN=" + support::cpp11::to_string(gemm_native_desc.n)); - ref_cl_build_options.add_option("-DK=" + support::cpp11::to_string(gemm_native_desc.k)); - ref_cl_build_options.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); - ref_cl_build_options.add_option("-DN0=" + support::cpp11::to_string(gemm_native_desc.rhs_info.n0)); - ref_cl_build_options.add_option("-DK0=" + support::cpp11::to_string(gemm_native_desc.rhs_info.k0)); - ref_cl_build_options.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - ref_cl_build_options.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - // Manually add PostOps - { - ref_cl_build_options.add_option("-DOP=ADD_X_POS_1"); - ref_cl_build_options.add_option("-DP2_ELTWISE_ARG1_HEIGHT=" + support::cpp11::to_string(t_dst_info.dimension(1))); - ref_cl_build_options.add_option("-DP2_ELTWISE_ARG1_WIDTH=" + support::cpp11::to_string(t_dst_info.dimension(0))); - } - } - cl_code.build_options = ref_cl_build_options; -} } // namespace TEST_SUITE(CL) @@ -185,7 +121,7 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL) const GemmNativeDescriptor gemm_native_desc{ 1.0, 1.0, m, n, k }; const GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 }; const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP }; - const TileDescriptor store_tile_info{}; + const TileDescriptor store_tile_info{ Size2D(gemm_info.rhs_info.n0, gemm_info.lhs_info.m0), Size2D(gemm_info.n, gemm_info.m), ClippingStrategy::TOP_LEFT }; ArgumentID tid_acc; st = add_tensor_intermed(bp, tid_acc); @@ -197,7 +133,6 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL) st = set_tile_info(bp, store_tile_info); st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp); - set_build_options(cl_code, gemm_native_desc, t_lhs_info, t_rhs_info, nullptr, t_dst_info); ClExecutionDescriptor exec_desc; st = tune_static(exec_desc, cl_code); @@ 
-288,7 +223,7 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL) const GemmNativeDescriptor gemm_native_desc{ 1.0, 0.0, m, n, k }; const GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 }; const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP }; - const TileDescriptor store_tile_info{}; + const TileDescriptor store_tile_info{ Size2D(gemm_info.rhs_info.n0, gemm_info.lhs_info.m0), Size2D(gemm_info.n, gemm_info.m), ClippingStrategy::TOP_LEFT }; // Create reference SimpleTensor ref_t_lhs{ t_lhs_shape, data_type, 1 }; @@ -344,7 +279,6 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL) st = set_tile_info(bp, store_tile_info); st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp); - set_build_options(cl_code, gemm_native_desc, t_lhs_info, t_rhs_info, nullptr, t_dst_info); TOCK(cond0_build_time, measurements) TICK(cond0_tune_time) -- cgit v1.2.1
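
Read as a summary of what the patch wires up: every fused kernel component now reports its own OpenCL `-D` defines through `generate_build_options()`, the blueprint's `build_options()` walks the component graph and merges them, and the hand-written `set_build_options()` helper in the test can therefore be deleted. Below is a minimal, self-contained C++ sketch of that aggregation pattern. It is not the ComputeLibrary API: `Component`, `GemmComponent`, `StoreComponent`, `BuildOptions` and `aggregate_build_options()` are hypothetical stand-ins, the define values are placeholders, and the sketch simply concatenates options in plain vector order instead of the blueprint's topological traversal and `CLBuildOptions` handling.

```cpp
// Sketch only: stand-ins for the per-component build-option generation pattern,
// not the ComputeLibrary classes touched by this patch.
#include <iostream>
#include <memory>
#include <string>
#include <vector>

// Stand-in for CLBuildOptions: an ordered list of "-DNAME=value" strings.
using BuildOptions = std::vector<std::string>;

// Stand-in for IClKernelComponent: every component can emit its own defines.
struct Component
{
    virtual ~Component() = default;
    virtual std::string  name() const                   = 0;
    virtual BuildOptions generate_build_options() const = 0;
};

// Hypothetical GEMM-like component: exports tile sizes and the data type.
struct GemmComponent : Component
{
    std::string name() const override
    {
        return "gemm_native";
    }
    BuildOptions generate_build_options() const override
    {
        return { "-DDATA_TYPE=float", "-DM0=4", "-DN0=4", "-DK0=2", "-DPARTIAL_STORE_M0=1" };
    }
};

// Hypothetical boundary-aware store component: exports its own partial-block define.
struct StoreComponent : Component
{
    std::string name() const override
    {
        return "store_boundary_aware";
    }
    BuildOptions generate_build_options() const override
    {
        return { "-DPARTIAL_STORE_N0=3" };
    }
};

// Stand-in for ClKernelBlueprint::build_options(): visit the components
// (the real code walks a topological sort of the component graph) and
// append every component's options to a single option list. No attempt is
// made here to de-duplicate repeated defines.
BuildOptions aggregate_build_options(const std::vector<std::unique_ptr<Component>> &components)
{
    BuildOptions all_opts{};
    for(const auto &comp : components)
    {
        const BuildOptions comp_opts = comp->generate_build_options();
        all_opts.insert(all_opts.end(), comp_opts.begin(), comp_opts.end());
    }
    return all_opts;
}

int main()
{
    std::vector<std::unique_ptr<Component>> components;
    components.emplace_back(std::make_unique<GemmComponent>());
    components.emplace_back(std::make_unique<StoreComponent>());

    for(const std::string &opt : aggregate_build_options(components))
    {
        std::cout << opt << '\n'; // These strings would be handed to the OpenCL compiler.
    }
    return 0;
}
```

The point of the split is the same as in the patch: a component only knows its own tile and data-type information, while the blueprint owns the traversal order and the final option list passed to the OpenCL compiler.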