From bd44caacf15a3b6b059af77e3345f79606067fea Mon Sep 17 00:00:00 2001 From: Giorgio Arena Date: Tue, 15 Mar 2022 13:45:15 +0000 Subject: [Dynamic Fusion] Implement build options generation Resolves: COMPMID-5153 Signed-off-by: Giorgio Arena Change-Id: Ic34cc1f0d092fafa7c2faa4dd705cf8f68eaf87e Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7317 Comments-Addressed: Arm Jenkins Reviewed-by: SiCong Li Tested-by: Arm Jenkins --- .../dynamic_fusion/ClKernelBuildingImpl/Common.h | 66 ++++-- .../components/ClElementwiseAddKernelComponent.cpp | 16 ++ .../components/ClElementwiseAddKernelComponent.h | 1 + .../components/ClGemmNativeKernelComponent.cpp | 225 +++++++++++++++------ .../components/ClGemmNativeKernelComponent.h | 1 + .../components/ClStoreKernelComponents.cpp | 17 ++ .../components/ClStoreKernelComponents.h | 5 +- .../CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp | 70 +------ 8 files changed, 248 insertions(+), 153 deletions(-) diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h index b285cc2b54..6e1291cdd5 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h @@ -30,6 +30,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/GPUTarget.h" #include "src/core/common/Macros.h" +#include "support/StringSupport.h" #include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h" @@ -191,7 +192,7 @@ public: struct TagVal { TagVal() = default; - TagVal(SharedVarTable::SharedVar var) + TagVal(const SharedVarTable::SharedVar &var) : value{ var.uniq_name } { } @@ -201,6 +202,11 @@ public: { } + TagVal(const std::string &val) + : value{ val } + { + } + std::string value{}; }; using TagLUT = std::unordered_map; // Used to instantiating a code template / replacing tags @@ -217,12 +223,12 @@ public: virtual std::vector get_links() const = 0; virtual std::string name() const = 0; + // @note: some tags can be unused since they could be used only for the macros, or only for the component code static std::string replace_tags(const std::string &code_template, const TagLUT &tags) { - std::string replaced_code = ""; - std::unordered_set used_tags{}; - bool scanning_pattern = false; - std::string pattern_found = ""; + std::string replaced_code = ""; + bool scanning_pattern = false; + std::string pattern_found = ""; for(size_t i = 0; i < code_template.size() - 1; ++i) { if(!scanning_pattern) @@ -247,7 +253,6 @@ public: std::string err = "Pattern " + pattern_found + " not found in tags"; ARM_COMPUTE_ERROR_ON_MSG(tags.find(pattern_found) == tags.end(), err.c_str()); replaced_code += tags.find(pattern_found)->second.value; - used_tags.insert(pattern_found); } else { @@ -255,12 +260,7 @@ public: } } } - // Check for unused tags - for(const auto &tag : tags) - { - ARM_COMPUTE_UNUSED(tag); - ARM_COMPUTE_ERROR_ON_MSG(used_tags.find(tag.first) == used_tags.end(), "Warning: unused tags"); - } + return replaced_code; } ComponentID id() const @@ -303,6 +303,11 @@ public: return ""; } + virtual CLBuildOptions generate_build_options() const + { + return CLBuildOptions{}; + } + protected: const ClKernelBlueprint *_blueprint; @@ -445,12 +450,10 @@ public: { std::string name = ""; - auto stack = topological_sort(); - while(!stack.empty()) + traverse([&](std::stack stack) { name += _components.find(stack.top())->second->name() + (stack.size() > 2 ? 
"___" : ""); - stack.pop(); - } + }); return name; } @@ -480,7 +483,7 @@ public: headers_list.insert(curr_headers_list.begin(), curr_headers_list.end()); if(!curr_additional_macros.empty()) // Some components might not have any { - additional_macros.insert(curr_additional_macros); + additional_macros.insert(IClKernelComponent::replace_tags(curr_additional_macros, var_lut)); } stack.pop(); @@ -524,7 +527,19 @@ public: CLBuildOptions build_options() const { - return CLBuildOptions{}; + CLBuildOptions build_opts{}; + + traverse([&](std::stack stack) + { + build_opts.add_options(_components.find(stack.top())->second->generate_build_options().options()); + }); + + return build_opts; + } + + TileDescriptor get_tile_info() const + { + return _tile_info; } Window get_execution_window() const @@ -596,6 +611,17 @@ private: return stack; } + void traverse(const std::function)> &func) const + { + std::stack stack = topological_sort(); + + while(!stack.empty()) + { + func(stack); + stack.pop(); + } + } + std::string generate_argument_declaration(const SharedVarTable::SharedVar &var) const { ARM_COMPUTE_ERROR_ON_MSG(var.group != SharedVarGroup::Argument, "An argument declaration can only be generated from a kernel argument"); @@ -672,7 +698,7 @@ private: ARM_COMPUTE_ERROR("Unsupported clipping strategy"); } - code += "\n REPEAT_VAR_INIT_TO_CONST(M0, uint, g_zout, 0);\n"; + code += "\n REPEAT_VAR_INIT_TO_CONST(" + std::to_string(tile_dim_y) + ", uint, g_zout, 0);\n"; code += " REPEAT_VAR_INIT_TO_CONST(16, uint, g_zero, 0);\n\n"; return code; @@ -684,7 +710,7 @@ private: int32_t _num_components{}; int32_t _num_complex_components{}; - ArgumentID _dst_id{ -1 }; + ArgumentID _dst_id{ -1 }; // Initially set to -1, which means the graph has no dst yet, since node IDs are positive numbers // Argument, components and intermediate tensors IDs with corresponding ptrs (except intermediate) std::unordered_map _components{}; diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp index 06c29c4253..bbdf8df0a3 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp @@ -82,6 +82,22 @@ std::string ClElementwiseAddKernelComponent::get_component_code() const )_"; } + +CLBuildOptions ClElementwiseAddKernelComponent::generate_build_options() const +{ + auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); + auto tile_info = _blueprint->impl().get_tile_info(); + + CLBuildOptions build_opts{}; + + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type())); + build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y())); + build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x())); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y())); + + return build_opts; +} + ClElementwiseAddKernelComponent::TagLUT ClElementwiseAddKernelComponent::allocate_vars(SharedVarTable &vtable) const { // Determine which argument is the accumulator diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h 
b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h index fe5f964c54..c259811a98 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h @@ -46,6 +46,7 @@ public: std::set get_headers_list() const override; std::string get_component_code() const override; Window get_window() const override; + CLBuildOptions generate_build_options() const override; virtual std::vector get_links() const override { diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp index e70e5d5ea5..4bf0b76c3a 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp @@ -28,6 +28,9 @@ #include "src/core/AccessWindowStatic.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/float_ops.h" +#include "support/StringSupport.h" + namespace arm_compute { namespace experimental @@ -214,6 +217,13 @@ std::string ClGemmNativeKernelComponent::get_additional_macros() const std::string ClGemmNativeKernelComponent::get_component_code() const { + auto t_lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id); + auto t_rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id); + + auto has_alpha = !(helpers::float_ops::is_one(_desc.alpha)); + auto reinterpret_input_as_3d = _desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d == 0; + auto dont_slide_b = t_rhs_info->num_dimensions() < t_lhs_info->num_dimensions(); + std::string code = R"_( //------------------ START KERNEL {{meta_kernel_id}} --------------------- // IN_0(lhs) {{lhs}} @@ -245,34 +255,49 @@ std::string ClGemmNativeKernelComponent::get_component_code() const // Compute RHS matrix address uint rhs_offset = {{rhs}}_offset_first_element_in_bytes + g_x * N0 * sizeof(DATA_TYPE); + )_"; -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - rhs_offset += (g_z % MATRIX_B_DEPTH) * {{rhs}}_stride_z; -#else // defined(MATRIX_B_DEPTH) - rhs_offset += g_z * {{rhs}}_stride_z; -#endif // defined(MATRIX_B_DEPTH) + if(dont_slide_b) + { + code += R"_( + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset += (g_z % {{MATRIX_B_DEPTH}}) * {{rhs}}_stride_z; + )_"; + } + else + { + code += R"_( + rhs_offset += g_z * {{rhs}}_stride_z; + )_"; + } + code += R"_( REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); + )_"; -#if defined(REINTERPRET_INPUT_AS_3D) - // The plane (zlhs) is calculated dividing M (g_y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, {{lhs}}_cross_plane_pad, {{lhs}}_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += g_z * {{lhs}}_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - lhs_offset += g_z * {{lhs}}_stride_z; + if(reinterpret_input_as_3d) + { + code += R"_( + // The plane (zlhs) is calculated dividing M (g_y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0), {{HEIGHT_GEMM3D}}, {{DEPTH_GEMM3D}}, {{lhs}}_cross_plane_pad, {{lhs}}_stride_y); -#endif // defined(REINTERPRET_INPUT_AS_3D) + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += g_z * {{lhs}}_stride_z * {{DEPTH_GEMM3D}}; + )_"; + } + else + { + code += R"_( + // Add offset for batched GEMM + lhs_offset += g_z * {{lhs}}_stride_z; + )_"; + } + code += R"_( int i = 0; -#if K0 > 1 - for(; i <= (K - K0); i += K0) +#if {{K0}} > 1 + for(; i <= (K - {{K0}}); i += {{K0}}) { // Supported cases (M0, K0): // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 @@ -284,26 +309,26 @@ std::string ClGemmNativeKernelComponent::get_component_code() const // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, {{lhs}}_ptr, lhs_offset, {{lhs}}_stride_y, zlhs); + LOAD_BLOCK(M0, {{K0}}, DATA_TYPE, a, {{lhs}}_ptr, lhs_offset, {{lhs}}_stride_y, zlhs); // Load values from RHS matrix - LOAD_BLOCK(K0, N0, DATA_TYPE, b, {{rhs}}_ptr, rhs_offset, {{rhs}}_stride_y, g_zero); + LOAD_BLOCK({{K0}}, N0, DATA_TYPE, b, {{rhs}}_ptr, rhs_offset, {{rhs}}_stride_y, g_zero); RHS_VFMA_M0xN0(0, a, b0, {{dst}}); RHS_VFMA_M0xN0(1, a, b1, {{dst}}); -#if K0 > 2 +#if {{K0}} > 2 RHS_VFMA_M0xN0(2, a, b2, {{dst}}); #endif // K0 > 2 -#if K0 > 3 +#if {{K0}} > 3 RHS_VFMA_M0xN0(3, a, b3, {{dst}}); #endif // K0 > 3 -#if K0 > 4 +#if {{K0}} > 4 RHS_VFMA_M0xN0(4, a, b4, {{dst}}); RHS_VFMA_M0xN0(5, a, b5, {{dst}}); RHS_VFMA_M0xN0(6, a, b6, {{dst}}); RHS_VFMA_M0xN0(7, a, b7, {{dst}}); #endif // K0 > 4 -#if K0 > 8 +#if {{K0}} > 8 RHS_VFMA_M0xN0(8, a, b8, {{dst}}); RHS_VFMA_M0xN0(9, a, b9, {{dst}}); RHS_VFMA_M0xN0(A, a, bA, {{dst}}); @@ -314,8 +339,8 @@ std::string ClGemmNativeKernelComponent::get_component_code() const RHS_VFMA_M0xN0(F, a, bF, {{dst}}); #endif // K0 > 8 - lhs_offset += K0 * sizeof(DATA_TYPE); - rhs_offset += K0 * {{rhs}}_stride_y; + lhs_offset += {{K0}} * sizeof(DATA_TYPE); + rhs_offset += {{K0}} * {{rhs}}_stride_y; } #endif // K0 > 1 // Left-over accumulations @@ -362,44 +387,61 @@ std::string ClGemmNativeKernelComponent::get_component_code() const } // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, {{dst}}, ALPHA); -#endif // defined(ALPHA) )_"; - - if(!_bias.is_empty()) + if(has_alpha) { code += R"_( - // Add beta*bias -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, {{dst}}, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (g_x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(g_y, M0, - 
PARTIAL_STORE_M0) - * {{bias}}_stride_y) - + g_z * {{bias}}_stride_z; - - LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(M0, {{dst}}, bias); + SCALE_BLOCK(M0, DATA_TYPE, {{dst}}, {{ALPHA}}); + )_"; + } -#endif // defined(BROADCAST_BIAS) - )_"; + if(!_bias.is_empty()) + { + if(_desc.broadcast_bias) + { + code += R"_( + // Add beta*bias + __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero); + )_"; + + if(helpers::float_ops::is_one(_desc.beta)) + { + code += R"_( + SCALE_BLOCK(1, DATA_TYPE, bias, {{BETA}}); + )_"; + } + + code += R"_( + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(M0, {{dst}}, bias0); + )_"; + } + else + { + code += R"_( + // Add beta*bias + __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (g_x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(g_y, M0, + PARTIAL_STORE_M0) + * {{bias}}_stride_y) + + g_z * {{bias}}_stride_z; + + LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero); + )_"; + + if(helpers::float_ops::is_one(_desc.beta)) + { + code += R"_( + SCALE_BLOCK(M0, DATA_TYPE, bias, {{BETA}}); + )_"; + } + + code += R"_( + // c = c + bias + ADD_BLOCK(M0, {{dst}}, bias); + )_"; + } } code += R"_( @@ -409,6 +451,25 @@ std::string ClGemmNativeKernelComponent::get_component_code() const return code.c_str(); } +CLBuildOptions ClGemmNativeKernelComponent::generate_build_options() const +{ + auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); + auto tile_info = _blueprint->impl().get_tile_info(); + + CLBuildOptions build_opts{}; + + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type())); + build_opts.add_option("-DM=" + support::cpp11::to_string(tile_info.boundaries.y())); + build_opts.add_option("-DN=" + support::cpp11::to_string(tile_info.boundaries.x())); + build_opts.add_option("-DK=" + support::cpp11::to_string(_desc.k)); + build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y())); + build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x())); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y())); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(tile_info.boundaries.x() % tile_info.tile_dims.x())); + + return build_opts; +} + ClGemmNativeKernelComponent::TagLUT ClGemmNativeKernelComponent::allocate_vars(SharedVarTable &vtable) const { TagLUT lut{}; @@ -421,6 +482,44 @@ ClGemmNativeKernelComponent::TagLUT ClGemmNativeKernelComponent::allocate_vars(S lut["bias"] = vtable.add(_bias, ClKernelArgRuntimeDescriptor(_bias.arg_id, TensorArgType::Image_3D), "bias"); } lut["dst"] = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Image_3D), "dst"); + + // Local build options + auto t_lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id); + auto t_rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id); + auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); + + auto has_alpha = !(helpers::float_ops::is_one(_desc.alpha)); + auto has_beta = _blueprint->impl().get_kernel_argument_info(_bias.arg_id) 
!= nullptr; + auto reinterpret_input_as_3d = _desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d == 0; + auto reinterpret_output_as_3d = !_desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d != 0; + auto dont_slide_b = t_rhs_info->num_dimensions() < t_lhs_info->num_dimensions(); + + lut["K0"] = support::cpp11::to_string(_desc.rhs_info.k0); + + if(has_alpha) + { + lut["ALPHA"] = float_to_string_with_full_precision(_desc.alpha); + } + if(has_beta) + { + lut["BETA"] = float_to_string_with_full_precision(_desc.beta); + } + if(dont_slide_b) + { + lut["MATRIX_B_DEPTH"] = support::cpp11::to_string(t_rhs_info->dimension(2)); + } + + if(reinterpret_output_as_3d) + { + lut["HEIGHT_GEMM3D"] = support::cpp11::to_string(t_dst_info->dimension(1)); + lut["DEPTH_GEMM3D"] = support::cpp11::to_string(t_dst_info->dimension(2)); + } + else if(reinterpret_input_as_3d) + { + lut["HEIGHT_GEMM3D"] = support::cpp11::to_string(t_lhs_info->dimension(1)); + lut["DEPTH_GEMM3D"] = support::cpp11::to_string(t_lhs_info->dimension(2)); + } + return lut; } } // namespace dynamic_fusion diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h index 09933a8932..1a1e3e3ce6 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h @@ -52,6 +52,7 @@ public: std::string get_component_code() const override; Window get_window() const override; ClKernelArgList get_args(); + CLBuildOptions generate_build_options() const override; virtual std::vector get_links() const override { diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp index 430fafb89f..2d7b46616f 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp @@ -61,6 +61,23 @@ std::string ClStoreBlockBoundaryAwareKernelComponent::get_component_code() const )_"; } + +CLBuildOptions ClStoreBlockBoundaryAwareKernelComponent::generate_build_options() const +{ + auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); + auto tile_info = _blueprint->impl().get_tile_info(); + + CLBuildOptions build_opts{}; + + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type())); + build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y())); + build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x())); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y())); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(tile_info.boundaries.x() % tile_info.tile_dims.x())); + + return build_opts; +} + ClStoreBlockBoundaryAwareKernelComponent::TagLUT ClStoreBlockBoundaryAwareKernelComponent::allocate_vars(SharedVarTable &vtable) const { return { diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h index 
ad7a207ef8..8d58da2a0d 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h @@ -41,8 +41,9 @@ public: : IClKernelComponent(blueprint), _src{ src }, _dst{ dst } { } - ComponentType get_component_type() const override; - std::string get_component_code() const override; + ComponentType get_component_type() const override; + std::string get_component_code() const override; + CLBuildOptions generate_build_options() const override; virtual std::vector get_links() const override { diff --git a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp index 753e0a4625..cb365901da 100644 --- a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp +++ b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp @@ -75,70 +75,6 @@ void fill(U &&tensor, int seed) DistributionType distribution_inf{ T(std::numeric_limits::infinity()), T(std::numeric_limits::infinity()) }; library->fill_borders_with_garbage(tensor, distribution_inf, seed); } - -void set_build_options(ClKernelCode &cl_code, GemmNativeDescriptor gemm_native_desc, - const TensorInfo &t_lhs_info, - const TensorInfo &t_rhs_info, - const TensorInfo *t_bias_info, - const TensorInfo &t_dst_info) -{ - CLBuildOptions ref_cl_build_options; - { - // If reinterpret_input_as_3d = reinterpret_output_as_3d = true, - // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. - // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m - auto reinterpret_input_as_3d = gemm_native_desc.reinterpret_input_as_3d; - auto reinterpret_output_as_3d = gemm_native_desc.depth_output_gemm3d != 0; - auto _slide_matrix_b = (t_rhs_info.num_dimensions() >= t_lhs_info.num_dimensions()); - auto _use_dummy_work_items = false; - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_output_as_3d) - { - reinterpret_input_as_3d = false; - reinterpret_output_as_3d = false; - } - - const unsigned int internal_m = reinterpret_output_as_3d ? gemm_native_desc.m : t_dst_info.dimension(1); - - const unsigned int h_gemm_3d = reinterpret_output_as_3d ? t_dst_info.dimension(1) : t_lhs_info.dimension(1); - const unsigned int d_gemm_3d = reinterpret_output_as_3d ? t_dst_info.dimension(2) : t_lhs_info.dimension(2); - - // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. - const unsigned int partial_store_m0 = internal_m % gemm_native_desc.lhs_info.m0; - const unsigned int partial_store_n0 = gemm_native_desc.n % gemm_native_desc.rhs_info.n0; - - // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. 
- const unsigned int internal_m0 = std::min(internal_m, gemm_native_desc.lhs_info.m0); - - ref_cl_build_options.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info.data_type())); - ref_cl_build_options.add_option_if(!(helpers::float_ops::is_one(gemm_native_desc.alpha)), "-DALPHA=" + float_to_string_with_full_precision(gemm_native_desc.alpha)); - ref_cl_build_options.add_option_if(t_bias_info != nullptr, "-DBETA=" + float_to_string_with_full_precision(gemm_native_desc.beta)); - ref_cl_build_options.add_option_if(helpers::float_ops::is_one(gemm_native_desc.beta), "-DUNIT_BETA"); - ref_cl_build_options.add_option_if(gemm_native_desc.broadcast_bias, "-DBROADCAST_BIAS"); - ref_cl_build_options.add_option_if(reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); - ref_cl_build_options.add_option_if(reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - ref_cl_build_options.add_option_if(reinterpret_input_as_3d || reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); - ref_cl_build_options.add_option_if(reinterpret_input_as_3d || reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); - ref_cl_build_options.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(t_rhs_info.dimension(2))); - ref_cl_build_options.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - ref_cl_build_options.add_option("-DM=" + support::cpp11::to_string(internal_m)); - ref_cl_build_options.add_option("-DN=" + support::cpp11::to_string(gemm_native_desc.n)); - ref_cl_build_options.add_option("-DK=" + support::cpp11::to_string(gemm_native_desc.k)); - ref_cl_build_options.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); - ref_cl_build_options.add_option("-DN0=" + support::cpp11::to_string(gemm_native_desc.rhs_info.n0)); - ref_cl_build_options.add_option("-DK0=" + support::cpp11::to_string(gemm_native_desc.rhs_info.k0)); - ref_cl_build_options.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - ref_cl_build_options.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - // Manually add PostOps - { - ref_cl_build_options.add_option("-DOP=ADD_X_POS_1"); - ref_cl_build_options.add_option("-DP2_ELTWISE_ARG1_HEIGHT=" + support::cpp11::to_string(t_dst_info.dimension(1))); - ref_cl_build_options.add_option("-DP2_ELTWISE_ARG1_WIDTH=" + support::cpp11::to_string(t_dst_info.dimension(0))); - } - } - cl_code.build_options = ref_cl_build_options; -} } // namespace TEST_SUITE(CL) @@ -185,7 +121,7 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL) const GemmNativeDescriptor gemm_native_desc{ 1.0, 1.0, m, n, k }; const GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 }; const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP }; - const TileDescriptor store_tile_info{}; + const TileDescriptor store_tile_info{ Size2D(gemm_info.rhs_info.n0, gemm_info.lhs_info.m0), Size2D(gemm_info.n, gemm_info.m), ClippingStrategy::TOP_LEFT }; ArgumentID tid_acc; st = add_tensor_intermed(bp, tid_acc); @@ -197,7 +133,6 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL) st = set_tile_info(bp, store_tile_info); st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp); - set_build_options(cl_code, gemm_native_desc, t_lhs_info, t_rhs_info, nullptr, t_dst_info); ClExecutionDescriptor exec_desc; st = tune_static(exec_desc, cl_code); @@ 
-288,7 +223,7 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL) const GemmNativeDescriptor gemm_native_desc{ 1.0, 0.0, m, n, k }; const GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 }; const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP }; - const TileDescriptor store_tile_info{}; + const TileDescriptor store_tile_info{ Size2D(gemm_info.rhs_info.n0, gemm_info.lhs_info.m0), Size2D(gemm_info.n, gemm_info.m), ClippingStrategy::TOP_LEFT }; // Create reference SimpleTensor ref_t_lhs{ t_lhs_shape, data_type, 1 }; @@ -344,7 +279,6 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL) st = set_tile_info(bp, store_tile_info); st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp); - set_build_options(cl_code, gemm_native_desc, t_lhs_info, t_rhs_info, nullptr, t_dst_info); TOCK(cond0_build_time, measurements) TICK(cond0_tune_time) -- cgit v1.2.1
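
Read as a summary of what the patch wires up: every fused kernel component now reports its own OpenCL `-D` defines through `generate_build_options()`, the blueprint's `build_options()` walks the component graph and merges them, and the hand-written `set_build_options()` helper in the test can therefore be deleted. Below is a minimal, self-contained C++ sketch of that aggregation pattern. It is not the ComputeLibrary API: `Component`, `GemmComponent`, `StoreComponent`, `BuildOptions` and `aggregate_build_options()` are hypothetical stand-ins, the define values are placeholders, and the sketch simply concatenates options in plain vector order instead of the blueprint's topological traversal and `CLBuildOptions` handling.

```cpp
// Sketch only: stand-ins for the per-component build-option generation pattern,
// not the ComputeLibrary classes touched by this patch.
#include <iostream>
#include <memory>
#include <string>
#include <vector>

// Stand-in for CLBuildOptions: an ordered list of "-DNAME=value" strings.
using BuildOptions = std::vector<std::string>;

// Stand-in for IClKernelComponent: every component can emit its own defines.
struct Component
{
    virtual ~Component() = default;
    virtual std::string  name() const                   = 0;
    virtual BuildOptions generate_build_options() const = 0;
};

// Hypothetical GEMM-like component: exports tile sizes and the data type.
struct GemmComponent : Component
{
    std::string name() const override
    {
        return "gemm_native";
    }
    BuildOptions generate_build_options() const override
    {
        return { "-DDATA_TYPE=float", "-DM0=4", "-DN0=4", "-DK0=2", "-DPARTIAL_STORE_M0=1" };
    }
};

// Hypothetical boundary-aware store component: exports its own partial-block define.
struct StoreComponent : Component
{
    std::string name() const override
    {
        return "store_boundary_aware";
    }
    BuildOptions generate_build_options() const override
    {
        return { "-DPARTIAL_STORE_N0=3" };
    }
};

// Stand-in for ClKernelBlueprint::build_options(): visit the components
// (the real code walks a topological sort of the component graph) and
// append every component's options to a single option list. No attempt is
// made here to de-duplicate repeated defines.
BuildOptions aggregate_build_options(const std::vector<std::unique_ptr<Component>> &components)
{
    BuildOptions all_opts{};
    for(const auto &comp : components)
    {
        const BuildOptions comp_opts = comp->generate_build_options();
        all_opts.insert(all_opts.end(), comp_opts.begin(), comp_opts.end());
    }
    return all_opts;
}

int main()
{
    std::vector<std::unique_ptr<Component>> components;
    components.emplace_back(std::make_unique<GemmComponent>());
    components.emplace_back(std::make_unique<StoreComponent>());

    for(const std::string &opt : aggregate_build_options(components))
    {
        std::cout << opt << '\n'; // These strings would be handed to the OpenCL compiler.
    }
    return 0;
}
```

The point of the split is the same as in the patch: a component only knows its own tile and data-type information, while the blueprint owns the traversal order and the final option list passed to the OpenCL compiler.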