9 files changed, 198 insertions, 114 deletions
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp
index e40f9c6da9..6db1ca4cf5 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp
@@ -61,13 +61,15 @@ Status add_tensor_intermed(ClKernelBlueprint &kernel_blueprint, ArgumentID &id)
     return Status{};
 }
 
-Status add_kcomp_gemm_native(ClKernelBlueprint &kernel_blueprint, const ClKernelComponentDescriptor &, const GemmNativeDescriptor &,
+Status add_kcomp_gemm_native(ClKernelBlueprint          &kernel_blueprint, const ClKernelComponentDescriptor &,
+                             const GemmNativeDescriptor &gemm_native_desc,
                              ArgumentID lhs_id, ArgumentID rhs_id, ArgumentID bias_id, ArgumentID &dst_id)
 {
     kernel_blueprint.impl().validate_arg_ids({ lhs_id, rhs_id, bias_id, dst_id });
-
     kernel_blueprint.impl().add_component(
         std::make_unique<ClGemmNativeKernelComponent>(
+            &kernel_blueprint,
+            gemm_native_desc,
             SharedVarLink{ lhs_id, SharedVarIO::Input, kernel_blueprint.impl().group(lhs_id) },
             SharedVarLink{ rhs_id, SharedVarIO::Input, kernel_blueprint.impl().group(rhs_id) },
             SharedVarLink{ dst_id, SharedVarIO::Output, kernel_blueprint.impl().group(dst_id) },
@@ -81,6 +83,7 @@ Status add_kcomp_eltwise_add(ClKernelBlueprint &kernel_blueprint, const ClKernel
 {
     kernel_blueprint.impl().add_component(
         std::make_unique<ClElementwiseAddKernelComponent>(
+            &kernel_blueprint,
             SharedVarLink{ src0_id, SharedVarIO::Input, kernel_blueprint.impl().group(src0_id) },
             SharedVarLink{ src1_id, SharedVarIO::Input, kernel_blueprint.impl().group(src1_id) },
             SharedVarLink{ dst_id, SharedVarIO::Output, kernel_blueprint.impl().group(dst_id) }));
@@ -98,6 +101,7 @@ Status add_kcomp_store(ClKernelBlueprint &kernel_blueprint, const ClKernelCompon
         case StoreType::StoreBlockBoundaryAware:
             kernel_blueprint.impl().add_component(
                 std::make_unique<ClStoreBlockBoundaryAwareKernelComponent>(
+                    &kernel_blueprint,
                     SharedVarLink{ src_tile, SharedVarIO::Input, kernel_blueprint.impl().group(src_tile) },
                     SharedVarLink{ dst_tile, SharedVarIO::Output, kernel_blueprint.impl().group(dst_tile) }));
             break;
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h
index 15622c848d..27ab294cc9 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h
@@ -100,13 +100,13 @@ struct ClKernelComponentDescriptor
 /** Component: Tensor Argument */
 struct ClTensorDescriptor
 {
-    ClTensorDescriptor(const ITensorInfo *info, unsigned int dim)
+    ClTensorDescriptor(ITensorInfo *info, unsigned int dim)
         : tensor_info(info), slice_dim(dim)
     {
     }
 
-    const ITensorInfo *tensor_info;
-    unsigned int       slice_dim;
+    ITensorInfo *tensor_info;
+    unsigned int slice_dim;
 };
 
 Status add_tensor_argument(ClKernelBlueprint &, const ClTensorDescriptor &, ArgumentID &);
@@ -133,8 +133,8 @@ struct GemmNativeDescriptor
     int32_t           b_offset{};
 };
 
-Status add_kcomp_gemm_native(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const GemmNativeDescriptor &, ArgumentID input_id,
-                             ArgumentID weights_id, ArgumentID bias_id, ArgumentID &dst_id);
+Status add_kcomp_gemm_native(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const GemmNativeDescriptor &,
+                             ArgumentID input_id, ArgumentID weights_id, ArgumentID bias_id, ArgumentID &dst_id);
 
 /** Component: Eltwise Add */
 struct EltwiseAddDescriptor
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
index 3b5160a055..b285cc2b54 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
@@ -29,6 +29,7 @@
 #include "arm_compute/core/CL/CLCompileContext.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/GPUTarget.h"
+#include "src/core/common/Macros.h"
 
 #include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
 
@@ -63,8 +64,8 @@ enum class SharedVarGroup
     Automatic // Automatic variables declared within the kernel body
 };
 
-/** Specifies a shared variable ink for a component.
- * It describes all the information that's availbale when a component is constructed / added:
+/** Specifies a shared variable link for a component.
+ * It describes all the information that's available when a component is constructed / added:
  *  e.g. its linkage (via ArgumentID and io) and its group
  * This is not shared variable on its own, but is used for instantiating a SharedVar when building the code
  */
@@ -204,6 +205,13 @@ public:
     };
     using TagLUT = std::unordered_map<Tag, TagVal>; // Used to instantiating a code template / replacing tags
 public:
+    /** Bind the component to @p blueprint; only the raw pointer is stored (non-owning, the destructor is defaulted) */
+    explicit IClKernelComponent(const ClKernelBlueprint *blueprint)
+        : _blueprint(blueprint)
+    {
+    }
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClKernelComponent);
+
     virtual ~IClKernelComponent()                        = default;
     virtual ComponentType     get_component_type() const = 0;
     virtual std::vector<Link> get_links() const          = 0;
@@ -278,6 +286,11 @@ public:
     {
         return "";
     }
+
+    virtual Window get_window() const
+    {
+        return Window{};
+    }
     /** "Allocate" all shared variables used in a component to the @p vtable, and generate a TagLUT used to instantiate the component code
      *
      * @param vtable
@@ -290,6 +303,9 @@ public:
         return "";
     }
 
+protected:
+    const ClKernelBlueprint *_blueprint;
+
 private:
     ComponentID _id{};
 };
@@ -398,6 +414,12 @@ public:
             // Additionally, set this component as one that treats this argument as "Output" (append to index 1)
             else
             {
+                if(component->get_component_type() == ComponentType::Store)
+                {
+                    ARM_COMPUTE_ERROR_ON_MSG(_dst_id >= 0, "Trying to add more than one dst argument to the graph");
+                    _dst_id = arg_id;
+                }
+
                 for(const auto &subseq_component : _outgoing_components[arg_id])
                 {
                     _component_graph[component_id].push_back(subseq_component);
@@ -430,7 +452,6 @@ public:
             stack.pop();
         }
 
-        std::cout << name << std::endl;
         return name;
     }
 
@@ -508,7 +529,15 @@ public:
 
     Window get_execution_window() const
     {
-        return Window{};
+        ARM_COMPUTE_ERROR_ON_MSG(_graph_root < 0, "No root found in the component graph");
+        ARM_COMPUTE_ERROR_ON_MSG(_dst_id == -1, "Destination Tensor Id should be ready before calling get_execution_window()");
+
+        return _components.find(_graph_root)->second->get_window();
+    }
+
+    ArgumentID get_dst_id() const
+    {
+        return _dst_id;
     }
 
     ClKernelArgList get_arguments() const
@@ -521,6 +550,26 @@ public:
         return arg_list;
     }
 
+    /** Look up the tensor descriptor registered for a kernel argument.
+     * @param[in] id Argument id to search for in the kernel argument table
+     * @return Pointer to the stored descriptor, or nullptr if @p id is not present in the table
+     */
+    const ClTensorDescriptor *get_kernel_argument(const ArgumentID id) const
+    {
+        const auto it = _kernel_arguments.find(id); // single lookup; the iterator is reused for the result
+        return (it != _kernel_arguments.end()) ? &it->second : nullptr;
+    }
+
+    /** Fetch the tensor info attached to a kernel argument.
+     * @param[in] id Argument id to look up
+     * @return The descriptor's tensor_info pointer, or nullptr if get_kernel_argument() finds no descriptor for @p id
+     */
+    ITensorInfo *get_kernel_argument_info(const ArgumentID id) const
+    {
+        const ClTensorDescriptor *const desc = get_kernel_argument(id);
+        return (desc == nullptr) ? nullptr : desc->tensor_info;
+    }
+
 private:
     void topological_sort_utility(ComponentID component_id, std::unordered_set<ComponentID> &visited, std::stack<ComponentID> &stack) const
     {
@@ -635,6 +684,8 @@ private:
     int32_t _num_components{};
     int32_t _num_complex_components{};
 
+    ArgumentID _dst_id{ -1 };
+
     // Argument, components and intermediate tensors IDs with corresponding ptrs (except intermediate)
     std::unordered_map<ComponentID, ComponentUniquePtr> _components{};
     std::unordered_map<ArgumentID, ClTensorDescriptor>  _kernel_arguments{};
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
index a44b5faee2..06c29c4253 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
@@ -24,6 +24,9 @@
 #if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
 
 #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h"
+#include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
@@ -41,6 +44,26 @@ std::set<std::string> ClElementwiseAddKernelComponent::get_headers_list() const
     return std::set<std::string> { "gemm_helpers.h", "repeat.h" };
 }
 
+// Execution window of the addition, obtained from calculate_max_window() on the destination tensor info.
+Window ClElementwiseAddKernelComponent::get_window() const
+{
+    const ITensorInfo *lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id);
+    const ITensorInfo *rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
+    // The destination is the blueprint-wide dst argument (get_dst_id()), not this component's own _dst link
+    ITensorInfo *dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs_info, rhs_info, dst_info);
+
+    // dst is auto-initialised (if empty) with the broadcast shape of both operands and the lhs data type
+    const TensorShape out_shape = ITensorInfo::broadcast_shape_and_valid_region(*lhs_info, *rhs_info).first;
+    auto_init_if_empty(*dst_info, out_shape, 1, lhs_info->data_type());
+
+    // Step along dimension 0: elements per 16 bytes, passed through adjust_vec_size() with dst's dimension 0
+    const unsigned int vector_size_byte_opencl           = 16;
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst_info->element_size(), dst_info->dimension(0));
+    return calculate_max_window(*dst_info, Steps(num_elems_processed_per_iteration));
+}
+
 std::string ClElementwiseAddKernelComponent::get_component_code() const
 {
     std::string code;
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
index c0de4ac9b8..fe5f964c54 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
@@ -37,13 +37,15 @@ namespace dynamic_fusion
 class ClElementwiseAddKernelComponent : public IClKernelComponent
 {
 public:
-    ClElementwiseAddKernelComponent(const Link &lhs, const Link &rhs, const Link &dst)
-        : _lhs{ lhs }, _rhs{ rhs }, _dst{ dst }
+    ClElementwiseAddKernelComponent(const ClKernelBlueprint *blueprint, const Link &lhs, const Link &rhs, const Link &dst)
+        : IClKernelComponent(blueprint), _lhs{ lhs }, _rhs{ rhs }, _dst{ dst }
     {
     }
+
     ComponentType         get_component_type() const override;
     std::set<std::string> get_headers_list() const override;
     std::string           get_component_code() const override;
+    Window                get_window() const override;
 
     virtual std::vector<Link> get_links() const override
     {
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp
index 1521973d55..e70e5d5ea5 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp
@@ -24,6 +24,9 @@
 #if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
 
 #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
@@ -41,6 +44,92 @@ std::set<std::string> ClGemmNativeKernelComponent::get_headers_list() const
     return std::set<std::string> { "./common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h", "gemm_helpers.h", "repeat.h" };
 }
 
+// Computes the execution window of the GEMM: auto-initialises dst if empty, builds the window from the dst shape
+// (2D-collapsed when only the output is reinterpreted as 3D) with steps (n0, m0), and errors out on insufficient padding.
+Window ClGemmNativeKernelComponent::get_window() const
+{
+    ITensorInfo *lhs_info  = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id);
+    ITensorInfo *rhs_info  = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
+    ITensorInfo *bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id);
+    ITensorInfo *dst_info  = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+    // bias is not part of the null check: it may be nullptr and is only dereferenced in the bias_info != nullptr branch
+    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs_info, rhs_info, dst_info);
+
+    bool reinterpret_input_as_3d  = _desc.reinterpret_input_as_3d;
+    bool reinterpret_output_as_3d = _desc.depth_output_gemm3d != 0;
+    bool window_changed           = false;
+
+    // If the two flags are equal (both set or both clear), the output flag is cleared,
+    // so the 2D-collapsed dst shape below is only used when the output alone is reinterpreted as 3D.
+    if(reinterpret_input_as_3d == reinterpret_output_as_3d)
+    {
+        reinterpret_output_as_3d = false;
+    }
+
+    // activation_layer is set to dummy because it's required by GEMMKernelInfo, but it's not used in shape calculation
+    GEMMKernelInfo gemm_info(_desc.m, _desc.n, _desc.k, _desc.depth_output_gemm3d, _desc.reinterpret_input_as_3d,
+                             _desc.broadcast_bias, _desc.fp_mixed_precision, _desc.has_pad_y, ActivationLayerInfo(), _desc.nmult_transpose1xW_width,
+                             _desc.mult_interleave4x4_height, _desc.lhs_info, _desc.rhs_info, _desc.a_offset, _desc.b_offset);
+
+    // dst tensor auto initialization if not yet initialized
+    auto_init_if_empty(*dst_info, lhs_info->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*lhs_info, *rhs_info, gemm_info)));
+
+    TensorInfo tmp_info(*dst_info);
+
+    if(reinterpret_output_as_3d)
+    {
+        // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+        // the window needs to be constructed on the 2D collapsed version of the tensor
+        TensorShape tmp_shape(dst_info->tensor_shape());
+        tmp_shape.collapse(2U, 1U);
+        tmp_info.set_tensor_shape(tmp_shape);
+    }
+
+    // win is built from the (possibly collapsed) tmp_info and is the one returned; win_out is built from dst itself
+    Window win     = calculate_max_window(tmp_info, Steps(_desc.rhs_info.n0, _desc.lhs_info.m0));
+    Window win_out = calculate_max_window(*dst_info, Steps(_desc.rhs_info.n0, _desc.lhs_info.m0));
+
+    AccessWindowStatic src0_access(lhs_info, 0, 0,
+                                   lhs_info->dimension(0),
+                                   lhs_info->dimension(1));
+    AccessWindowStatic src1_access(rhs_info, 0, 0,
+                                   ceil_to_multiple(rhs_info->dimension(0), _desc.rhs_info.n0),
+                                   rhs_info->dimension(1));
+    AccessWindowStatic dst_access(dst_info, 0, 0,
+                                  dst_info->dimension(0),
+                                  dst_info->dimension(1));
+
+    if(bias_info != nullptr)
+    {
+        const int bias_processed_per_iteration_x = _desc.rhs_info.n0;
+
+        AccessWindowStatic src2_access(bias_info, 0, 0,
+                                       ceil_to_multiple(bias_info->dimension(0), bias_processed_per_iteration_x),
+                                       bias_info->dimension(1));
+
+        window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop
+                         update_window_and_padding(win_out, dst_access);                          // window used to update the padding requirements of dst tensor
+    }
+    else
+    {
+        window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop
+                         update_window_and_padding(win_out, dst_access);             // window used to update the padding requirements of dst tensor
+    }
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst_info->num_dimensions()), 2u);
+    Window             collapsed             = win.collapse(win, dimension_to_collapse);
+
+    // A change reported by update_window_and_padding() is treated as a fatal error rather than silently accepted
+    if(window_changed)
+    {
+        ARM_COMPUTE_ERROR("Insufficient Padding!");
+    }
+
+    return collapsed;
+}
+
 std::string ClGemmNativeKernelComponent::get_additional_macros() const
 {
     return R"_(
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h
index 38f007c07c..09933a8932 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h
@@ -26,7 +26,10 @@
 #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLGEMMNATIVEKERNELCOMPONENT_H
 #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLGEMMNATIVEKERNELCOMPONENT_H
 
+#include "arm_compute/core/Steps.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 namespace arm_compute
 {
@@ -37,14 +40,17 @@ namespace dynamic_fusion
 class ClGemmNativeKernelComponent : public IClKernelComponent
 {
 public:
-    ClGemmNativeKernelComponent(const Link &lhs, const Link &rhs, const Link &dst, const Link &bias = Link{})
-        : _lhs{ lhs }, _rhs{ rhs }, _bias{ bias }, _dst{ dst }
+    ClGemmNativeKernelComponent(const ClKernelBlueprint *blueprint, const GemmNativeDescriptor &desc,
+                                const Link &lhs, const Link &rhs, const Link &dst, const Link &bias = Link{})
+        : IClKernelComponent(blueprint), _desc{ desc }, _lhs{ lhs }, _rhs{ rhs }, _bias{ bias }, _dst{ dst }
     {
     }
+
     ComponentType         get_component_type() const override;
     std::set<std::string> get_headers_list() const override;
     std::string           get_additional_macros() const override;
     std::string           get_component_code() const override;
+    Window                get_window() const override;
     ClKernelArgList       get_args();
 
     virtual std::vector<Link> get_links() const override
@@ -60,10 +66,11 @@ public:
     }
 
 private:
-    Link _lhs{};
-    Link _rhs{};
-    Link _bias{};
-    Link _dst{};
+    GemmNativeDescriptor _desc{};
+    Link                 _lhs{};
+    Link                 _rhs{};
+    Link                 _bias{};
+    Link                 _dst{};
 };
 
 } // namespace dynamic_fusion
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
index f0d01d30a9..ad7a207ef8 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
@@ -37,8 +37,8 @@ namespace dynamic_fusion
 class ClStoreBlockBoundaryAwareKernelComponent : public IClKernelComponent
 {
 public:
-    ClStoreBlockBoundaryAwareKernelComponent(const Link &src, const Link &dst)
-        : _src{ src }, _dst{ dst }
+    ClStoreBlockBoundaryAwareKernelComponent(const ClKernelBlueprint *blueprint, const Link &src, const Link &dst)
+        : IClKernelComponent(blueprint), _src{ src }, _dst{ dst }
     {
     }
     ComponentType get_component_type() const override;
diff --git a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
index c4e7033914..753e0a4625 100644
--- a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
+++ b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
@@ -21,7 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-
 #if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
 
 #include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h"
@@ -77,85 +76,6 @@ void fill(U &&tensor, int seed)
     library->fill_borders_with_garbage(tensor, distribution_inf, seed);
 }
 
-using ElementsProcessed = Steps;
-std::pair<Status, Window> mock_gemm_native_validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info,
-                                                                         const GEMMRHSMatrixInfo &rhs_info,
-                                                                         const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
-    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
-    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
-    bool          reinterpret_input_as_3d             = gemm_info.reinterpret_input_as_3d;
-    bool          reinterpret_output_as_3d            = gemm_info.depth_output_gemm3d != 0;
-
-    Window win{};
-    Window win_out{};
-    bool   window_changed = false;
-
-    // In case both input and dst have to be reinterpreted as 3D tensors,
-    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
-    if(reinterpret_input_as_3d == reinterpret_output_as_3d)
-    {
-        reinterpret_output_as_3d = false;
-    }
-
-    // dst tensor auto initialization if not yet initialized
-    auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
-
-    TensorInfo tmp_info(*dst);
-
-    if(reinterpret_output_as_3d)
-    {
-        // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
-        // the window needs to be constructed on the 2D collapsed version of the tensor
-        TensorShape tmp_shape(dst->tensor_shape());
-        tmp_shape.collapse(2U, 1U);
-        tmp_info.set_tensor_shape(tmp_shape);
-    }
-
-    // Configure kernel window
-    num_elems_processed_per_iteration_x = rhs_info.n0;
-    num_elems_processed_per_iteration_y = lhs_info.m0;
-
-    win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-    win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-    AccessWindowStatic src0_access(src0, 0, 0,
-                                   src0->dimension(0),
-                                   src0->dimension(1));
-    AccessWindowStatic src1_access(src1, 0, 0,
-                                   ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x),
-                                   src1->dimension(1));
-    AccessWindowStatic dst_access(dst, 0, 0,
-                                  dst->dimension(0),
-                                  dst->dimension(1));
-
-    if(src2 != nullptr)
-    {
-        const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
-
-        AccessWindowStatic src2_access(src2, 0, 0,
-                                       ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x),
-                                       src2->dimension(1));
-
-        window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop
-                         update_window_and_padding(win_out, dst_access);                          // window used to update the padding requirements of dst tensor
-    }
-    else
-    {
-        window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop
-                         update_window_and_padding(win_out, dst_access);             // window used to update the padding requirements of dst tensor
-    }
-
-    // Collapse along the Z direction
-    // This collapse needs to be here in order to tune the Z dimension of LWS
-    Window             collapsed             = win;
-    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
-    collapsed                                = win.collapse(win, dimension_to_collapse);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, collapsed);
-}
-
 void set_build_options(ClKernelCode &cl_code, GemmNativeDescriptor gemm_native_desc,
                        const TensorInfo &t_lhs_info,
                        const TensorInfo &t_rhs_info,
@@ -241,7 +161,7 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
     const auto t_dst_shape = TensorShape(n, m);
     auto       t_lhs_info  = TensorInfo(t_lhs_shape, 1, data_type);
     auto       t_rhs_info  = TensorInfo(t_rhs_shape, 1, data_type);
-    const auto t_bias_info = TensorInfo(TensorShape(), 1, DataType::F32);
+    auto       t_bias_info = TensorInfo(TensorShape(), 1, DataType::F32);
     auto       t_dst_info  = TensorInfo(t_dst_shape, 1, data_type);
 
     const ClTensorDescriptor t_lhs_desc{ &t_lhs_info, 2 };
@@ -270,7 +190,6 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
     ArgumentID tid_acc;
     st = add_tensor_intermed(bp, tid_acc);
     st = add_kcomp_gemm_native(bp, common_kernel_desc, gemm_native_desc, tid_lhs, tid_rhs, tid_l0_bias, tid_acc);
-
     st = add_kcomp_eltwise_add(bp, common_kernel_desc, EltwiseAddDescriptor{}, tid_l1_addend, tid_acc, tid_acc);
     st = add_kcomp_store(bp, common_kernel_desc, tid_acc, tid_dst, StoreType::StoreBlockBoundaryAware);
 
@@ -278,13 +197,7 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
 
     st = set_tile_info(bp, store_tile_info);
     st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp);
-
     set_build_options(cl_code, gemm_native_desc, t_lhs_info, t_rhs_info, nullptr, t_dst_info);
-    ElementsProcessed num_elements_processed{};
-    auto              win_config = mock_gemm_native_validate_and_configure_window(&t_lhs_info, &t_rhs_info, nullptr, &t_dst_info, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, gemm_info,
-                                                                                  num_elements_processed);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    cl_code.window = win_config.second;
 
     ClExecutionDescriptor exec_desc;
     st = tune_static(exec_desc, cl_code);
@@ -432,11 +345,6 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
         st = set_tile_info(bp, store_tile_info);
         st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp);
         set_build_options(cl_code, gemm_native_desc, t_lhs_info, t_rhs_info, nullptr, t_dst_info);
-        ElementsProcessed num_elements_processed{};
-        auto              win_config = mock_gemm_native_validate_and_configure_window(&t_lhs_info, &t_rhs_info, nullptr, &t_dst_info, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, gemm_info,
-                                                                                      num_elements_processed);
-        ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-        cl_code.window = win_config.second;
         TOCK(cond0_build_time, measurements)
 
         TICK(cond0_tune_time)