aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGiorgio Arena <giorgio.arena@arm.com>2022-03-15 13:45:15 +0000
committerGiorgio Arena <giorgio.arena@arm.com>2022-03-24 12:51:30 +0000
commitbd44caacf15a3b6b059af77e3345f79606067fea (patch)
treefc0235aee09359d9bc0cc9d2ca60b22c20abd148
parent09adcc4142c95bdac66f02abd099ac4751e2f40f (diff)
downloadComputeLibrary-bd44caacf15a3b6b059af77e3345f79606067fea.tar.gz
[Dynamic Fusion] Implement build options generation
Resolves: COMPMID-5153 Signed-off-by: Giorgio Arena <giorgio.arena@arm.com> Change-Id: Ic34cc1f0d092fafa7c2faa4dd705cf8f68eaf87e Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7317 Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: SiCong Li <sicong.li@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h66
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp16
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h1
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp225
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h1
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp17
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h5
-rw-r--r--tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp70
8 files changed, 248 insertions, 153 deletions
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
index b285cc2b54..6e1291cdd5 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
@@ -30,6 +30,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/GPUTarget.h"
#include "src/core/common/Macros.h"
+#include "support/StringSupport.h"
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
@@ -191,7 +192,7 @@ public:
struct TagVal
{
TagVal() = default;
- TagVal(SharedVarTable::SharedVar var)
+ TagVal(const SharedVarTable::SharedVar &var)
: value{ var.uniq_name }
{
}
@@ -201,6 +202,11 @@ public:
{
}
+ TagVal(const std::string &val)
+ : value{ val }
+ {
+ }
+
std::string value{};
};
using TagLUT = std::unordered_map<Tag, TagVal>; // Used to instantiating a code template / replacing tags
@@ -217,12 +223,12 @@ public:
virtual std::vector<Link> get_links() const = 0;
virtual std::string name() const = 0;
+ // @note: some tags can be unused since they could be used only for the macros, or only for the component code
static std::string replace_tags(const std::string &code_template, const TagLUT &tags)
{
- std::string replaced_code = "";
- std::unordered_set<std::string> used_tags{};
- bool scanning_pattern = false;
- std::string pattern_found = "";
+ std::string replaced_code = "";
+ bool scanning_pattern = false;
+ std::string pattern_found = "";
for(size_t i = 0; i < code_template.size() - 1; ++i)
{
if(!scanning_pattern)
@@ -247,7 +253,6 @@ public:
std::string err = "Pattern " + pattern_found + " not found in tags";
ARM_COMPUTE_ERROR_ON_MSG(tags.find(pattern_found) == tags.end(), err.c_str());
replaced_code += tags.find(pattern_found)->second.value;
- used_tags.insert(pattern_found);
}
else
{
@@ -255,12 +260,7 @@ public:
}
}
}
- // Check for unused tags
- for(const auto &tag : tags)
- {
- ARM_COMPUTE_UNUSED(tag);
- ARM_COMPUTE_ERROR_ON_MSG(used_tags.find(tag.first) == used_tags.end(), "Warning: unused tags");
- }
+
return replaced_code;
}
ComponentID id() const
@@ -303,6 +303,11 @@ public:
return "";
}
+ virtual CLBuildOptions generate_build_options() const
+ {
+ return CLBuildOptions{};
+ }
+
protected:
const ClKernelBlueprint *_blueprint;
@@ -445,12 +450,10 @@ public:
{
std::string name = "";
- auto stack = topological_sort();
- while(!stack.empty())
+ traverse([&](std::stack<ComponentID> stack)
{
name += _components.find(stack.top())->second->name() + (stack.size() > 2 ? "___" : "");
- stack.pop();
- }
+ });
return name;
}
@@ -480,7 +483,7 @@ public:
headers_list.insert(curr_headers_list.begin(), curr_headers_list.end());
if(!curr_additional_macros.empty()) // Some components might not have any
{
- additional_macros.insert(curr_additional_macros);
+ additional_macros.insert(IClKernelComponent::replace_tags(curr_additional_macros, var_lut));
}
stack.pop();
@@ -524,7 +527,19 @@ public:
CLBuildOptions build_options() const
{
- return CLBuildOptions{};
+ CLBuildOptions build_opts{};
+
+ traverse([&](std::stack<ComponentID> stack)
+ {
+ build_opts.add_options(_components.find(stack.top())->second->generate_build_options().options());
+ });
+
+ return build_opts;
+ }
+
+ TileDescriptor get_tile_info() const
+ {
+ return _tile_info;
}
Window get_execution_window() const
@@ -596,6 +611,17 @@ private:
return stack;
}
+ void traverse(const std::function<void(std::stack<ComponentID>)> &func) const
+ {
+ std::stack<ComponentID> stack = topological_sort();
+
+ while(!stack.empty())
+ {
+ func(stack);
+ stack.pop();
+ }
+ }
+
std::string generate_argument_declaration(const SharedVarTable::SharedVar &var) const
{
ARM_COMPUTE_ERROR_ON_MSG(var.group != SharedVarGroup::Argument, "An argument declaration can only be generated from a kernel argument");
@@ -672,7 +698,7 @@ private:
ARM_COMPUTE_ERROR("Unsupported clipping strategy");
}
- code += "\n REPEAT_VAR_INIT_TO_CONST(M0, uint, g_zout, 0);\n";
+ code += "\n REPEAT_VAR_INIT_TO_CONST(" + std::to_string(tile_dim_y) + ", uint, g_zout, 0);\n";
code += " REPEAT_VAR_INIT_TO_CONST(16, uint, g_zero, 0);\n\n";
return code;
@@ -684,7 +710,7 @@ private:
int32_t _num_components{};
int32_t _num_complex_components{};
- ArgumentID _dst_id{ -1 };
+ ArgumentID _dst_id{ -1 }; // Initially set to -1, which means the graph has no dst yet, since node IDs are positive numbers
// Argument, components and intermediate tensors IDs with corresponding ptrs (except intermediate)
std::unordered_map<ComponentID, ComponentUniquePtr> _components{};
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
index 06c29c4253..bbdf8df0a3 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
@@ -82,6 +82,22 @@ std::string ClElementwiseAddKernelComponent::get_component_code() const
)_";
}
+
+CLBuildOptions ClElementwiseAddKernelComponent::generate_build_options() const
+{
+ auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+ auto tile_info = _blueprint->impl().get_tile_info();
+
+ CLBuildOptions build_opts{};
+
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type()));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y()));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x()));
+ build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y()));
+
+ return build_opts;
+}
+
ClElementwiseAddKernelComponent::TagLUT ClElementwiseAddKernelComponent::allocate_vars(SharedVarTable &vtable) const
{
// Determine which argument is the accumulator
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
index fe5f964c54..c259811a98 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
@@ -46,6 +46,7 @@ public:
std::set<std::string> get_headers_list() const override;
std::string get_component_code() const override;
Window get_window() const override;
+ CLBuildOptions generate_build_options() const override;
virtual std::vector<Link> get_links() const override
{
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp
index e70e5d5ea5..4bf0b76c3a 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp
@@ -28,6 +28,9 @@
#include "src/core/AccessWindowStatic.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/helpers/float_ops.h"
+#include "support/StringSupport.h"
+
namespace arm_compute
{
namespace experimental
@@ -214,6 +217,13 @@ std::string ClGemmNativeKernelComponent::get_additional_macros() const
std::string ClGemmNativeKernelComponent::get_component_code() const
{
+ auto t_lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id);
+ auto t_rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
+
+ auto has_alpha = !(helpers::float_ops::is_one(_desc.alpha));
+ auto reinterpret_input_as_3d = _desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d == 0;
+ auto dont_slide_b = t_rhs_info->num_dimensions() < t_lhs_info->num_dimensions();
+
std::string code = R"_(
//------------------ START KERNEL {{meta_kernel_id}} ---------------------
// IN_0(lhs) {{lhs}}
@@ -245,34 +255,49 @@ std::string ClGemmNativeKernelComponent::get_component_code() const
// Compute RHS matrix address
uint rhs_offset = {{rhs}}_offset_first_element_in_bytes + g_x * N0 * sizeof(DATA_TYPE);
+ )_";
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- rhs_offset += (g_z % MATRIX_B_DEPTH) * {{rhs}}_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- rhs_offset += g_z * {{rhs}}_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
+ if(dont_slide_b)
+ {
+ code += R"_(
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_offset += (g_z % {{MATRIX_B_DEPTH}}) * {{rhs}}_stride_z;
+ )_";
+ }
+ else
+ {
+ code += R"_(
+ rhs_offset += g_z * {{rhs}}_stride_z;
+ )_";
+ }
+ code += R"_(
REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
+ )_";
-#if defined(REINTERPRET_INPUT_AS_3D)
- // The plane (zlhs) is calculated dividing M (g_y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, {{lhs}}_cross_plane_pad, {{lhs}}_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply lhs_stride_z by DEPTH_GEMM3D
- lhs_offset += g_z * {{lhs}}_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- lhs_offset += g_z * {{lhs}}_stride_z;
+ if(reinterpret_input_as_3d)
+ {
+ code += R"_(
+ // The plane (zlhs) is calculated dividing M (g_y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0), {{HEIGHT_GEMM3D}}, {{DEPTH_GEMM3D}}, {{lhs}}_cross_plane_pad, {{lhs}}_stride_y);
-#endif // defined(REINTERPRET_INPUT_AS_3D)
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += g_z * {{lhs}}_stride_z * {{DEPTH_GEMM3D}};
+ )_";
+ }
+ else
+ {
+ code += R"_(
+ // Add offset for batched GEMM
+ lhs_offset += g_z * {{lhs}}_stride_z;
+ )_";
+ }
+ code += R"_(
int i = 0;
-#if K0 > 1
- for(; i <= (K - K0); i += K0)
+#if {{K0}} > 1
+ for(; i <= (K - {{K0}}); i += {{K0}})
{
// Supported cases (M0, K0):
// 1,2 - 1,3 - 1,4 - 1,8 - 1,16
@@ -284,26 +309,26 @@ std::string ClGemmNativeKernelComponent::get_component_code() const
// 7,2 - 7,3 - 7,4 - 7,8 - 7,16
// 8,2 - 8,3 - 8,4 - 8,8 - 8,16
// Load values from LHS matrix
- LOAD_BLOCK(M0, K0, DATA_TYPE, a, {{lhs}}_ptr, lhs_offset, {{lhs}}_stride_y, zlhs);
+ LOAD_BLOCK(M0, {{K0}}, DATA_TYPE, a, {{lhs}}_ptr, lhs_offset, {{lhs}}_stride_y, zlhs);
// Load values from RHS matrix
- LOAD_BLOCK(K0, N0, DATA_TYPE, b, {{rhs}}_ptr, rhs_offset, {{rhs}}_stride_y, g_zero);
+ LOAD_BLOCK({{K0}}, N0, DATA_TYPE, b, {{rhs}}_ptr, rhs_offset, {{rhs}}_stride_y, g_zero);
RHS_VFMA_M0xN0(0, a, b0, {{dst}});
RHS_VFMA_M0xN0(1, a, b1, {{dst}});
-#if K0 > 2
+#if {{K0}} > 2
RHS_VFMA_M0xN0(2, a, b2, {{dst}});
#endif // K0 > 2
-#if K0 > 3
+#if {{K0}} > 3
RHS_VFMA_M0xN0(3, a, b3, {{dst}});
#endif // K0 > 3
-#if K0 > 4
+#if {{K0}} > 4
RHS_VFMA_M0xN0(4, a, b4, {{dst}});
RHS_VFMA_M0xN0(5, a, b5, {{dst}});
RHS_VFMA_M0xN0(6, a, b6, {{dst}});
RHS_VFMA_M0xN0(7, a, b7, {{dst}});
#endif // K0 > 4
-#if K0 > 8
+#if {{K0}} > 8
RHS_VFMA_M0xN0(8, a, b8, {{dst}});
RHS_VFMA_M0xN0(9, a, b9, {{dst}});
RHS_VFMA_M0xN0(A, a, bA, {{dst}});
@@ -314,8 +339,8 @@ std::string ClGemmNativeKernelComponent::get_component_code() const
RHS_VFMA_M0xN0(F, a, bF, {{dst}});
#endif // K0 > 8
- lhs_offset += K0 * sizeof(DATA_TYPE);
- rhs_offset += K0 * {{rhs}}_stride_y;
+ lhs_offset += {{K0}} * sizeof(DATA_TYPE);
+ rhs_offset += {{K0}} * {{rhs}}_stride_y;
}
#endif // K0 > 1
// Left-over accumulations
@@ -362,44 +387,61 @@ std::string ClGemmNativeKernelComponent::get_component_code() const
}
// Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
- SCALE_BLOCK(M0, DATA_TYPE, {{dst}}, ALPHA);
-#endif // defined(ALPHA)
)_";
-
- if(!_bias.is_empty())
+ if(has_alpha)
{
code += R"_(
- // Add beta*bias
-#if defined(BROADCAST_BIAS)
- __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, {{dst}}, bias0);
-
-#else // defined(BROADCAST_BIAS)
- __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (g_x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(g_y, M0,
- PARTIAL_STORE_M0)
- * {{bias}}_stride_y)
- + g_z * {{bias}}_stride_z;
-
- LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero);
-
-#ifndef UNIT_BETA
- SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BIAS
-
- // c = c + bias
- ADD_BLOCK(M0, {{dst}}, bias);
+ SCALE_BLOCK(M0, DATA_TYPE, {{dst}}, {{ALPHA}});
+ )_";
+ }
-#endif // defined(BROADCAST_BIAS)
- )_";
+ if(!_bias.is_empty())
+ {
+ if(_desc.broadcast_bias)
+ {
+ code += R"_(
+ // Add beta*bias
+ __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero);
+ )_";
+
+ if(helpers::float_ops::is_one(_desc.beta))
+ {
+ code += R"_(
+ SCALE_BLOCK(1, DATA_TYPE, bias, {{BETA}});
+ )_";
+ }
+
+ code += R"_(
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(M0, {{dst}}, bias0);
+ )_";
+ }
+ else
+ {
+ code += R"_(
+ // Add beta*bias
+ __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (g_x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(g_y, M0,
+ PARTIAL_STORE_M0)
+ * {{bias}}_stride_y)
+ + g_z * {{bias}}_stride_z;
+
+ LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero);
+ )_";
+
+ if(helpers::float_ops::is_one(_desc.beta))
+ {
+ code += R"_(
+ SCALE_BLOCK(M0, DATA_TYPE, bias, {{BETA}});
+ )_";
+ }
+
+ code += R"_(
+ // c = c + bias
+ ADD_BLOCK(M0, {{dst}}, bias);
+ )_";
+ }
}
code += R"_(
@@ -409,6 +451,25 @@ std::string ClGemmNativeKernelComponent::get_component_code() const
return code.c_str();
}
+CLBuildOptions ClGemmNativeKernelComponent::generate_build_options() const
+{
+ auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+ auto tile_info = _blueprint->impl().get_tile_info();
+
+ CLBuildOptions build_opts{};
+
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type()));
+ build_opts.add_option("-DM=" + support::cpp11::to_string(tile_info.boundaries.y()));
+ build_opts.add_option("-DN=" + support::cpp11::to_string(tile_info.boundaries.x()));
+ build_opts.add_option("-DK=" + support::cpp11::to_string(_desc.k));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y()));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x()));
+ build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y()));
+ build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(tile_info.boundaries.x() % tile_info.tile_dims.x()));
+
+ return build_opts;
+}
+
ClGemmNativeKernelComponent::TagLUT ClGemmNativeKernelComponent::allocate_vars(SharedVarTable &vtable) const
{
TagLUT lut{};
@@ -421,6 +482,44 @@ ClGemmNativeKernelComponent::TagLUT ClGemmNativeKernelComponent::allocate_vars(S
lut["bias"] = vtable.add(_bias, ClKernelArgRuntimeDescriptor(_bias.arg_id, TensorArgType::Image_3D), "bias");
}
lut["dst"] = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Image_3D), "dst");
+
+ // Local build options
+ auto t_lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id);
+ auto t_rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
+ auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+
+ auto has_alpha = !(helpers::float_ops::is_one(_desc.alpha));
+ auto has_beta = _blueprint->impl().get_kernel_argument_info(_bias.arg_id) != nullptr;
+ auto reinterpret_input_as_3d = _desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d == 0;
+ auto reinterpret_output_as_3d = !_desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d != 0;
+ auto dont_slide_b = t_rhs_info->num_dimensions() < t_lhs_info->num_dimensions();
+
+ lut["K0"] = support::cpp11::to_string(_desc.rhs_info.k0);
+
+ if(has_alpha)
+ {
+ lut["ALPHA"] = float_to_string_with_full_precision(_desc.alpha);
+ }
+ if(has_beta)
+ {
+ lut["BETA"] = float_to_string_with_full_precision(_desc.beta);
+ }
+ if(dont_slide_b)
+ {
+ lut["MATRIX_B_DEPTH"] = support::cpp11::to_string(t_rhs_info->dimension(2));
+ }
+
+ if(reinterpret_output_as_3d)
+ {
+ lut["HEIGHT_GEMM3D"] = support::cpp11::to_string(t_dst_info->dimension(1));
+ lut["DEPTH_GEMM3D"] = support::cpp11::to_string(t_dst_info->dimension(2));
+ }
+ else if(reinterpret_input_as_3d)
+ {
+ lut["HEIGHT_GEMM3D"] = support::cpp11::to_string(t_lhs_info->dimension(1));
+ lut["DEPTH_GEMM3D"] = support::cpp11::to_string(t_lhs_info->dimension(2));
+ }
+
return lut;
}
} // namespace dynamic_fusion
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h
index 09933a8932..1a1e3e3ce6 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h
@@ -52,6 +52,7 @@ public:
std::string get_component_code() const override;
Window get_window() const override;
ClKernelArgList get_args();
+ CLBuildOptions generate_build_options() const override;
virtual std::vector<Link> get_links() const override
{
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp
index 430fafb89f..2d7b46616f 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp
@@ -61,6 +61,23 @@ std::string ClStoreBlockBoundaryAwareKernelComponent::get_component_code() const
)_";
}
+
+CLBuildOptions ClStoreBlockBoundaryAwareKernelComponent::generate_build_options() const
+{
+ auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+ auto tile_info = _blueprint->impl().get_tile_info();
+
+ CLBuildOptions build_opts{};
+
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type()));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y()));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x()));
+ build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y()));
+ build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(tile_info.boundaries.x() % tile_info.tile_dims.x()));
+
+ return build_opts;
+}
+
ClStoreBlockBoundaryAwareKernelComponent::TagLUT ClStoreBlockBoundaryAwareKernelComponent::allocate_vars(SharedVarTable &vtable) const
{
return {
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
index ad7a207ef8..8d58da2a0d 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
@@ -41,8 +41,9 @@ public:
: IClKernelComponent(blueprint), _src{ src }, _dst{ dst }
{
}
- ComponentType get_component_type() const override;
- std::string get_component_code() const override;
+ ComponentType get_component_type() const override;
+ std::string get_component_code() const override;
+ CLBuildOptions generate_build_options() const override;
virtual std::vector<Link> get_links() const override
{
diff --git a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
index 753e0a4625..cb365901da 100644
--- a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
+++ b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
@@ -75,70 +75,6 @@ void fill(U &&tensor, int seed)
DistributionType distribution_inf{ T(std::numeric_limits<float>::infinity()), T(std::numeric_limits<float>::infinity()) };
library->fill_borders_with_garbage(tensor, distribution_inf, seed);
}
-
-void set_build_options(ClKernelCode &cl_code, GemmNativeDescriptor gemm_native_desc,
- const TensorInfo &t_lhs_info,
- const TensorInfo &t_rhs_info,
- const TensorInfo *t_bias_info,
- const TensorInfo &t_dst_info)
-{
- CLBuildOptions ref_cl_build_options;
- {
- // If reinterpret_input_as_3d = reinterpret_output_as_3d = true,
- // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
- // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m
- auto reinterpret_input_as_3d = gemm_native_desc.reinterpret_input_as_3d;
- auto reinterpret_output_as_3d = gemm_native_desc.depth_output_gemm3d != 0;
- auto _slide_matrix_b = (t_rhs_info.num_dimensions() >= t_lhs_info.num_dimensions());
- auto _use_dummy_work_items = false;
- // In case both input and dst have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(reinterpret_input_as_3d == reinterpret_output_as_3d)
- {
- reinterpret_input_as_3d = false;
- reinterpret_output_as_3d = false;
- }
-
- const unsigned int internal_m = reinterpret_output_as_3d ? gemm_native_desc.m : t_dst_info.dimension(1);
-
- const unsigned int h_gemm_3d = reinterpret_output_as_3d ? t_dst_info.dimension(1) : t_lhs_info.dimension(1);
- const unsigned int d_gemm_3d = reinterpret_output_as_3d ? t_dst_info.dimension(2) : t_lhs_info.dimension(2);
-
- // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
- const unsigned int partial_store_m0 = internal_m % gemm_native_desc.lhs_info.m0;
- const unsigned int partial_store_n0 = gemm_native_desc.n % gemm_native_desc.rhs_info.n0;
-
- // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
- const unsigned int internal_m0 = std::min(internal_m, gemm_native_desc.lhs_info.m0);
-
- ref_cl_build_options.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info.data_type()));
- ref_cl_build_options.add_option_if(!(helpers::float_ops::is_one(gemm_native_desc.alpha)), "-DALPHA=" + float_to_string_with_full_precision(gemm_native_desc.alpha));
- ref_cl_build_options.add_option_if(t_bias_info != nullptr, "-DBETA=" + float_to_string_with_full_precision(gemm_native_desc.beta));
- ref_cl_build_options.add_option_if(helpers::float_ops::is_one(gemm_native_desc.beta), "-DUNIT_BETA");
- ref_cl_build_options.add_option_if(gemm_native_desc.broadcast_bias, "-DBROADCAST_BIAS");
- ref_cl_build_options.add_option_if(reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
- ref_cl_build_options.add_option_if(reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- ref_cl_build_options.add_option_if(reinterpret_input_as_3d || reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
- ref_cl_build_options.add_option_if(reinterpret_input_as_3d || reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
- ref_cl_build_options.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(t_rhs_info.dimension(2)));
- ref_cl_build_options.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
- ref_cl_build_options.add_option("-DM=" + support::cpp11::to_string(internal_m));
- ref_cl_build_options.add_option("-DN=" + support::cpp11::to_string(gemm_native_desc.n));
- ref_cl_build_options.add_option("-DK=" + support::cpp11::to_string(gemm_native_desc.k));
- ref_cl_build_options.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
- ref_cl_build_options.add_option("-DN0=" + support::cpp11::to_string(gemm_native_desc.rhs_info.n0));
- ref_cl_build_options.add_option("-DK0=" + support::cpp11::to_string(gemm_native_desc.rhs_info.k0));
- ref_cl_build_options.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
- ref_cl_build_options.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
- // Manually add PostOps
- {
- ref_cl_build_options.add_option("-DOP=ADD_X_POS_1");
- ref_cl_build_options.add_option("-DP2_ELTWISE_ARG1_HEIGHT=" + support::cpp11::to_string(t_dst_info.dimension(1)));
- ref_cl_build_options.add_option("-DP2_ELTWISE_ARG1_WIDTH=" + support::cpp11::to_string(t_dst_info.dimension(0)));
- }
- }
- cl_code.build_options = ref_cl_build_options;
-}
} // namespace
TEST_SUITE(CL)
@@ -185,7 +121,7 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
const GemmNativeDescriptor gemm_native_desc{ 1.0, 1.0, m, n, k };
const GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 };
const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP };
- const TileDescriptor store_tile_info{};
+ const TileDescriptor store_tile_info{ Size2D(gemm_info.rhs_info.n0, gemm_info.lhs_info.m0), Size2D(gemm_info.n, gemm_info.m), ClippingStrategy::TOP_LEFT };
ArgumentID tid_acc;
st = add_tensor_intermed(bp, tid_acc);
@@ -197,7 +133,6 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
st = set_tile_info(bp, store_tile_info);
st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp);
- set_build_options(cl_code, gemm_native_desc, t_lhs_info, t_rhs_info, nullptr, t_dst_info);
ClExecutionDescriptor exec_desc;
st = tune_static(exec_desc, cl_code);
@@ -288,7 +223,7 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
const GemmNativeDescriptor gemm_native_desc{ 1.0, 0.0, m, n, k };
const GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 };
const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP };
- const TileDescriptor store_tile_info{};
+ const TileDescriptor store_tile_info{ Size2D(gemm_info.rhs_info.n0, gemm_info.lhs_info.m0), Size2D(gemm_info.n, gemm_info.m), ClippingStrategy::TOP_LEFT };
// Create reference
SimpleTensor<float> ref_t_lhs{ t_lhs_shape, data_type, 1 };
@@ -344,7 +279,6 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
st = set_tile_info(bp, store_tile_info);
st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp);
- set_build_options(cl_code, gemm_native_desc, t_lhs_info, t_rhs_info, nullptr, t_dst_info);
TOCK(cond0_build_time, measurements)
TICK(cond0_tune_time)