Set up automatic kernel window for dynamic fusion

The window of the fused kernels is equal to the root kernel's window. For this reason, the kernels that follow the root kernel do not change the window, and intermediate tensors have the same shape and info as the destination tensor. Resolves: COMPMID-5152 Change-Id: I25fe2fab8304ecaabfc2e4ade9bbf31a600a5033 Signed-off-by: Gunes Bayir <gunes.bayir@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7316 Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: SiCong Li <sicong.li@arm.com>
author: Gunes Bayir <gunes.bayir@arm.com> 2022-03-10 21:21:01 +0000
committer: Gunes Bayir <gunes.bayir@arm.com> 2022-03-17 17:41:21 +0000
commit: 8a87983c90299dfc7d6fbda3dba312e7603d7074 (patch)
tree: ad1299cd902e6b2e5662f3f6e1b8fd12835b8469
parent: 193cad36d8ff70792562390b554304cc19284f61 (diff)
download: ComputeLibrary-8a87983c90299dfc7d6fbda3dba312e7603d7074.tar.gz
9 files changed, 198 insertions, 114 deletions
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp
index e40f9c6da9..6db1ca4cf5 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp
@@ -61,13 +61,15 @@ Status add_tensor_intermed(ClKernelBlueprint &kernel_blueprint, ArgumentID &id)
     return Status{};
 }
 
-Status add_kcomp_gemm_native(ClKernelBlueprint &kernel_blueprint, const ClKernelComponentDescriptor &, const GemmNativeDescriptor &,
+Status add_kcomp_gemm_native(ClKernelBlueprint          &kernel_blueprint, const ClKernelComponentDescriptor &,
+                             const GemmNativeDescriptor &gemm_native_desc,
                              ArgumentID lhs_id, ArgumentID rhs_id, ArgumentID bias_id, ArgumentID &dst_id)
 {
     kernel_blueprint.impl().validate_arg_ids({ lhs_id, rhs_id, bias_id, dst_id });
-
     kernel_blueprint.impl().add_component(
         std::make_unique<ClGemmNativeKernelComponent>(
+            &kernel_blueprint,
+            gemm_native_desc,
             SharedVarLink{ lhs_id, SharedVarIO::Input, kernel_blueprint.impl().group(lhs_id) },
             SharedVarLink{ rhs_id, SharedVarIO::Input, kernel_blueprint.impl().group(rhs_id) },
             SharedVarLink{ dst_id, SharedVarIO::Output, kernel_blueprint.impl().group(dst_id) },
@@ -81,6 +83,7 @@ Status add_kcomp_eltwise_add(ClKernelBlueprint &kernel_blueprint, const ClKernel
 {
     kernel_blueprint.impl().add_component(
         std::make_unique<ClElementwiseAddKernelComponent>(
+            &kernel_blueprint,
             SharedVarLink{ src0_id, SharedVarIO::Input, kernel_blueprint.impl().group(src0_id) },
             SharedVarLink{ src1_id, SharedVarIO::Input, kernel_blueprint.impl().group(src1_id) },
             SharedVarLink{ dst_id, SharedVarIO::Output, kernel_blueprint.impl().group(dst_id) }));
@@ -98,6 +101,7 @@ Status add_kcomp_store(ClKernelBlueprint &kernel_blueprint, const ClKernelCompon
         case StoreType::StoreBlockBoundaryAware:
             kernel_blueprint.impl().add_component(
                 std::make_unique<ClStoreBlockBoundaryAwareKernelComponent>(
+                    &kernel_blueprint,
                     SharedVarLink{ src_tile, SharedVarIO::Input, kernel_blueprint.impl().group(src_tile) },
                     SharedVarLink{ dst_tile, SharedVarIO::Output, kernel_blueprint.impl().group(dst_tile) }));
             break;
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h
index 15622c848d..27ab294cc9 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h
@@ -100,13 +100,13 @@ struct ClKernelComponentDescriptor
 /** Component: Tensor Argument */
 struct ClTensorDescriptor
 {
-    ClTensorDescriptor(const ITensorInfo *info, unsigned int dim)
+    ClTensorDescriptor(ITensorInfo *info, unsigned int dim)
         : tensor_info(info), slice_dim(dim)
     {
     }
 
-    const ITensorInfo *tensor_info;
-    unsigned int       slice_dim;
+    ITensorInfo *tensor_info;
+    unsigned int slice_dim;
 };
 
 Status add_tensor_argument(ClKernelBlueprint &, const ClTensorDescriptor &, ArgumentID &);
@@ -133,8 +133,8 @@ struct GemmNativeDescriptor
     int32_t           b_offset{};
 };
 
-Status add_kcomp_gemm_native(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const GemmNativeDescriptor &, ArgumentID input_id,
-                             ArgumentID weights_id, ArgumentID bias_id, ArgumentID &dst_id);
+Status add_kcomp_gemm_native(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const GemmNativeDescriptor &,
+                             ArgumentID input_id, ArgumentID weights_id, ArgumentID bias_id, ArgumentID &dst_id);
 
 /** Component: Eltwise Add */
 struct EltwiseAddDescriptor
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
index 3b5160a055..b285cc2b54 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
@@ -29,6 +29,7 @@
 #include "arm_compute/core/CL/CLCompileContext.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/GPUTarget.h"
+#include "src/core/common/Macros.h"
 
 #include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
 
@@ -63,8 +64,8 @@ enum class SharedVarGroup
     Automatic // Automatic variables declared within the kernel body
 };
 
-/** Specifies a shared variable ink for a component.
- * It describes all the information that's availbale when a component is constructed / added:
+/** Specifies a shared variable link for a component.
+ * It describes all the information that's available when a component is constructed / added:
  *  e.g. its linkage (via ArgumentID and io) and its group
  * This is not shared variable on its own, but is used for instantiating a SharedVar when building the code
  */
@@ -204,6 +205,13 @@ public:
     };
     using TagLUT = std::unordered_map<Tag, TagVal>; // Used to instantiating a code template / replacing tags
 public:
+    IClKernelComponent(const ClKernelBlueprint *blueprint)
+        : _blueprint(blueprint)
+    {
+    }
+
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClKernelComponent);
+
     virtual ~IClKernelComponent()                        = default;
     virtual ComponentType     get_component_type() const = 0;
     virtual std::vector<Link> get_links() const          = 0;
@@ -278,6 +286,11 @@ public:
     {
         return "";
     }
+
+    virtual Window get_window() const
+    {
+        return Window{};
+    }
     /** "Allocate" all shared variables used in a component to the @p vtable, and generate a TagLUT used to instantiate the component code
      *
      * @param vtable
@@ -290,6 +303,9 @@ public:
         return "";
     }
 
+protected:
+    const ClKernelBlueprint *_blueprint;
+
 private:
     ComponentID _id{};
 };
@@ -398,6 +414,12 @@ public:
             // Additionally, set this component as one that treats this argument as "Output" (append to index 1)
             else
             {
+                if(component->get_component_type() == ComponentType::Store)
+                {
+                    ARM_COMPUTE_ERROR_ON_MSG(_dst_id >= 0, "Trying to add more than one dst argument to the graph");
+                    _dst_id = arg_id;
+                }
+
                 for(const auto &subseq_component : _outgoing_components[arg_id])
                 {
                     _component_graph[component_id].push_back(subseq_component);
@@ -430,7 +452,6 @@ public:
             stack.pop();
         }
 
-        std::cout << name << std::endl;
         return name;
     }
 
@@ -508,7 +529,15 @@ public:
 
     Window get_execution_window() const
     {
-        return Window{};
+        ARM_COMPUTE_ERROR_ON_MSG(_graph_root < 0, "No root found in the component graph");
+        ARM_COMPUTE_ERROR_ON_MSG(_dst_id == -1, "Destination Tensor Id should be ready before calling get_execution_window()");
+
+        return _components.find(_graph_root)->second->get_window();
+    }
+
+    ArgumentID get_dst_id() const
+    {
+        return _dst_id;
     }
 
     ClKernelArgList get_arguments() const
@@ -521,6 +550,26 @@ public:
         return arg_list;
     }
 
+    const ClTensorDescriptor *get_kernel_argument(const ArgumentID id) const
+    {
+        auto it = _kernel_arguments.find(id);
+        if(it != _kernel_arguments.end())
+        {
+            return &_kernel_arguments.find(id)->second;
+        }
+        return nullptr;
+    }
+
+    ITensorInfo *get_kernel_argument_info(const ArgumentID id) const
+    {
+        const ClTensorDescriptor *arg_desc = get_kernel_argument(id);
+        if(arg_desc != nullptr)
+        {
+            return arg_desc->tensor_info;
+        }
+        return nullptr;
+    }
+
 private:
     void topological_sort_utility(ComponentID component_id, std::unordered_set<ComponentID> &visited, std::stack<ComponentID> &stack) const
     {
@@ -635,6 +684,8 @@ private:
     int32_t _num_components{};
     int32_t _num_complex_components{};
 
+    ArgumentID _dst_id{ -1 };
+
     // Argument, components and intermediate tensors IDs with corresponding ptrs (except intermediate)
     std::unordered_map<ComponentID, ComponentUniquePtr> _components{};
     std::unordered_map<ArgumentID, ClTensorDescriptor>  _kernel_arguments{};
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
index a44b5faee2..06c29c4253 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
@@ -24,6 +24,9 @@
 #if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
 
 #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h"
+#include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
@@ -41,6 +44,26 @@ std::set<std::string> ClElementwiseAddKernelComponent::get_headers_list() const
     return std::set<std::string> { "gemm_helpers.h", "repeat.h" };
 }
 
+Window ClElementwiseAddKernelComponent::get_window() const
+{
+    const ITensorInfo *lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id);
+    const ITensorInfo *rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
+    ITensorInfo       *dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs_info, rhs_info, dst_info);
+
+    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*lhs_info, *rhs_info);
+    const TensorShape &out_shape = broadcast_pair.first;
+
+    auto_init_if_empty(*dst_info, out_shape, 1, lhs_info->data_type());
+
+    const unsigned int vector_size_byte_opencl           = 16;
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst_info->element_size(), dst_info->dimension(0));
+    Window             win                               = calculate_max_window(*dst_info, Steps(num_elems_processed_per_iteration));
+
+    return win;
+}
+
 std::string ClElementwiseAddKernelComponent::get_component_code() const
 {
     std::string code;
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
index c0de4ac9b8..fe5f964c54 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
@@ -37,13 +37,15 @@ namespace dynamic_fusion
 class ClElementwiseAddKernelComponent : public IClKernelComponent
 {
 public:
-    ClElementwiseAddKernelComponent(const Link &lhs, const Link &rhs, const Link &dst)
-        : _lhs{ lhs }, _rhs{ rhs }, _dst{ dst }
+    ClElementwiseAddKernelComponent(const ClKernelBlueprint *blueprint, const Link &lhs, const Link &rhs, const Link &dst)
+        : IClKernelComponent(blueprint), _lhs{ lhs }, _rhs{ rhs }, _dst{ dst }
     {
     }
+
     ComponentType         get_component_type() const override;
     std::set<std::string> get_headers_list() const override;
     std::string           get_component_code() const override;
+    Window                get_window() const override;
 
     virtual std::vector<Link> get_links() const override
     {
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp
index 1521973d55..e70e5d5ea5 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp
@@ -24,6 +24,9 @@
 #if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
 
 #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
@@ -41,6 +44,92 @@ std::set<std::string> ClGemmNativeKernelComponent::get_headers_list() const
     return std::set<std::string> { "./common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h", "gemm_helpers.h", "repeat.h" };
 }
 
+Window ClGemmNativeKernelComponent::get_window() const
+{
+    ITensorInfo *lhs_info  = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id);
+    ITensorInfo *rhs_info  = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
+    ITensorInfo *bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id);
+    ITensorInfo *dst_info  = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs_info, rhs_info, dst_info);
+
+    bool reinterpret_input_as_3d  = _desc.reinterpret_input_as_3d;
+    bool reinterpret_output_as_3d = _desc.depth_output_gemm3d != 0;
+
+    Window win{};
+    Window win_out{};
+    bool   window_changed = false;
+
+    // In case both input and dst have to be reinterpreted as 3D tensors,
+    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+    if(reinterpret_input_as_3d == reinterpret_output_as_3d)
+    {
+        reinterpret_output_as_3d = false;
+    }
+
+    // activation_layer is set to dummy because it's required by GEMMKernelInfo, but it's not used in shape calculation
+    GEMMKernelInfo gemm_info(_desc.m, _desc.n, _desc.k, _desc.depth_output_gemm3d, _desc.reinterpret_input_as_3d,
+                             _desc.broadcast_bias, _desc.fp_mixed_precision, _desc.has_pad_y, ActivationLayerInfo(), _desc.nmult_transpose1xW_width,
+                             _desc.mult_interleave4x4_height, _desc.lhs_info, _desc.rhs_info, _desc.a_offset, _desc.b_offset);
+
+    // dst tensor auto initialization if not yet initialized
+    auto_init_if_empty(*dst_info, lhs_info->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*lhs_info, *rhs_info, gemm_info)));
+
+    TensorInfo tmp_info(*dst_info);
+
+    if(reinterpret_output_as_3d)
+    {
+        // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+        // the window needs to be constructed on the 2D collapsed version of the tensor
+        TensorShape tmp_shape(dst_info->tensor_shape());
+        tmp_shape.collapse(2U, 1U);
+        tmp_info.set_tensor_shape(tmp_shape);
+    }
+
+    win     = calculate_max_window(tmp_info, Steps(_desc.rhs_info.n0, _desc.lhs_info.m0));
+    win_out = calculate_max_window(*dst_info, Steps(_desc.rhs_info.n0, _desc.lhs_info.m0));
+
+    AccessWindowStatic src0_access(lhs_info, 0, 0,
+                                   lhs_info->dimension(0),
+                                   lhs_info->dimension(1));
+    AccessWindowStatic src1_access(rhs_info, 0, 0,
+                                   ceil_to_multiple(rhs_info->dimension(0), _desc.rhs_info.n0),
+                                   rhs_info->dimension(1));
+    AccessWindowStatic dst_access(dst_info, 0, 0,
+                                  dst_info->dimension(0),
+                                  dst_info->dimension(1));
+
+    if(bias_info != nullptr)
+    {
+        const int bias_processed_per_iteration_x = _desc.rhs_info.n0;
+
+        AccessWindowStatic src2_access(bias_info, 0, 0,
+                                       ceil_to_multiple(bias_info->dimension(0), bias_processed_per_iteration_x),
+                                       bias_info->dimension(1));
+
+        window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop
+                         update_window_and_padding(win_out, dst_access);                          // window used to update the padding requirements of dst tensor
+    }
+    else
+    {
+        window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop
+                         update_window_and_padding(win_out, dst_access);             // window used to update the padding requirements of dst tensor
+    }
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window             collapsed             = win;
+    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst_info->num_dimensions()), 2u);
+    collapsed                                = win.collapse(win, dimension_to_collapse);
+
+    if(window_changed == true)
+    {
+        ARM_COMPUTE_ERROR("Insufficient Padding!");
+    }
+
+    return collapsed;
+}
+
 std::string ClGemmNativeKernelComponent::get_additional_macros() const
 {
     return R"_(
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h
index 38f007c07c..09933a8932 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h
@@ -26,7 +26,10 @@
 #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLGEMMNATIVEKERNELCOMPONENT_H
 #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLGEMMNATIVEKERNELCOMPONENT_H
 
+#include "arm_compute/core/Steps.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 namespace arm_compute
 {
@@ -37,14 +40,17 @@ namespace dynamic_fusion
 class ClGemmNativeKernelComponent : public IClKernelComponent
 {
 public:
-    ClGemmNativeKernelComponent(const Link &lhs, const Link &rhs, const Link &dst, const Link &bias = Link{})
-        : _lhs{ lhs }, _rhs{ rhs }, _bias{ bias }, _dst{ dst }
+    ClGemmNativeKernelComponent(const ClKernelBlueprint *blueprint, const GemmNativeDescriptor &desc,
+                                const Link &lhs, const Link &rhs, const Link &dst, const Link &bias = Link{})
+        : IClKernelComponent(blueprint), _desc{ desc }, _lhs{ lhs }, _rhs{ rhs }, _bias{ bias }, _dst{ dst }
     {
     }
+
     ComponentType         get_component_type() const override;
     std::set<std::string> get_headers_list() const override;
     std::string           get_additional_macros() const override;
     std::string           get_component_code() const override;
+    Window                get_window() const override;
     ClKernelArgList       get_args();
 
     virtual std::vector<Link> get_links() const override
@@ -60,10 +66,11 @@ public:
     }
 
 private:
-    Link _lhs{};
-    Link _rhs{};
-    Link _bias{};
-    Link _dst{};
+    GemmNativeDescriptor _desc{};
+    Link                 _lhs{};
+    Link                 _rhs{};
+    Link                 _bias{};
+    Link                 _dst{};
 };
 
 } // namespace dynamic_fusion
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
index f0d01d30a9..ad7a207ef8 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
@@ -37,8 +37,8 @@ namespace dynamic_fusion
 class ClStoreBlockBoundaryAwareKernelComponent : public IClKernelComponent
 {
 public:
-    ClStoreBlockBoundaryAwareKernelComponent(const Link &src, const Link &dst)
-        : _src{ src }, _dst{ dst }
+    ClStoreBlockBoundaryAwareKernelComponent(const ClKernelBlueprint *blueprint, const Link &src, const Link &dst)
+        : IClKernelComponent(blueprint), _src{ src }, _dst{ dst }
     {
     }
     ComponentType get_component_type() const override;
diff --git a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
index c4e7033914..753e0a4625 100644
--- a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
+++ b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
@@ -21,7 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-
 #if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
 
 #include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h"
@@ -77,85 +76,6 @@ void fill(U &&tensor, int seed)
     library->fill_borders_with_garbage(tensor, distribution_inf, seed);
 }
 
-using ElementsProcessed = Steps;
-std::pair<Status, Window> mock_gemm_native_validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info,
-                                                                         const GEMMRHSMatrixInfo &rhs_info,
-                                                                         const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
-    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
-    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
-    bool          reinterpret_input_as_3d             = gemm_info.reinterpret_input_as_3d;
-    bool          reinterpret_output_as_3d            = gemm_info.depth_output_gemm3d != 0;
-
-    Window win{};
-    Window win_out{};
-    bool   window_changed = false;
-
-    // In case both input and dst have to be reinterpreted as 3D tensors,
-    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
-    if(reinterpret_input_as_3d == reinterpret_output_as_3d)
-    {
-        reinterpret_output_as_3d = false;
-    }
-
-    // dst tensor auto initialization if not yet initialized
-    auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
-
-    TensorInfo tmp_info(*dst);
-
-    if(reinterpret_output_as_3d)
-    {
-        // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
-        // the window needs to be constructed on the 2D collapsed version of the tensor
-        TensorShape tmp_shape(dst->tensor_shape());
-        tmp_shape.collapse(2U, 1U);
-        tmp_info.set_tensor_shape(tmp_shape);
-    }
-
-    // Configure kernel window
-    num_elems_processed_per_iteration_x = rhs_info.n0;
-    num_elems_processed_per_iteration_y = lhs_info.m0;
-
-    win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-    win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-    AccessWindowStatic src0_access(src0, 0, 0,
-                                   src0->dimension(0),
-                                   src0->dimension(1));
-    AccessWindowStatic src1_access(src1, 0, 0,
-                                   ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x),
-                                   src1->dimension(1));
-    AccessWindowStatic dst_access(dst, 0, 0,
-                                  dst->dimension(0),
-                                  dst->dimension(1));
-
-    if(src2 != nullptr)
-    {
-        const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
-
-        AccessWindowStatic src2_access(src2, 0, 0,
-                                       ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x),
-                                       src2->dimension(1));
-
-        window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop
-                         update_window_and_padding(win_out, dst_access);                          // window used to update the padding requirements of dst tensor
-    }
-    else
-    {
-        window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop
-                         update_window_and_padding(win_out, dst_access);             // window used to update the padding requirements of dst tensor
-    }
-
-    // Collapse along the Z direction
-    // This collapse needs to be here in order to tune the Z dimension of LWS
-    Window             collapsed             = win;
-    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
-    collapsed                                = win.collapse(win, dimension_to_collapse);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, collapsed);
-}
-
 void set_build_options(ClKernelCode &cl_code, GemmNativeDescriptor gemm_native_desc,
                        const TensorInfo &t_lhs_info,
                        const TensorInfo &t_rhs_info,
@@ -241,7 +161,7 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
     const auto t_dst_shape = TensorShape(n, m);
     auto       t_lhs_info  = TensorInfo(t_lhs_shape, 1, data_type);
     auto       t_rhs_info  = TensorInfo(t_rhs_shape, 1, data_type);
-    const auto t_bias_info = TensorInfo(TensorShape(), 1, DataType::F32);
+    auto       t_bias_info = TensorInfo(TensorShape(), 1, DataType::F32);
     auto       t_dst_info  = TensorInfo(t_dst_shape, 1, data_type);
 
     const ClTensorDescriptor t_lhs_desc{ &t_lhs_info, 2 };
@@ -270,7 +190,6 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
     ArgumentID tid_acc;
     st = add_tensor_intermed(bp, tid_acc);
     st = add_kcomp_gemm_native(bp, common_kernel_desc, gemm_native_desc, tid_lhs, tid_rhs, tid_l0_bias, tid_acc);
-
     st = add_kcomp_eltwise_add(bp, common_kernel_desc, EltwiseAddDescriptor{}, tid_l1_addend, tid_acc, tid_acc);
     st = add_kcomp_store(bp, common_kernel_desc, tid_acc, tid_dst, StoreType::StoreBlockBoundaryAware);
 
@@ -278,13 +197,7 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
 
     st = set_tile_info(bp, store_tile_info);
     st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp);
-
     set_build_options(cl_code, gemm_native_desc, t_lhs_info, t_rhs_info, nullptr, t_dst_info);
-    ElementsProcessed num_elements_processed{};
-    auto              win_config = mock_gemm_native_validate_and_configure_window(&t_lhs_info, &t_rhs_info, nullptr, &t_dst_info, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, gemm_info,
-                                                                                  num_elements_processed);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    cl_code.window = win_config.second;
 
     ClExecutionDescriptor exec_desc;
     st = tune_static(exec_desc, cl_code);
@@ -432,11 +345,6 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
         st = set_tile_info(bp, store_tile_info);
         st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp);
         set_build_options(cl_code, gemm_native_desc, t_lhs_info, t_rhs_info, nullptr, t_dst_info);
-        ElementsProcessed num_elements_processed{};
-        auto              win_config = mock_gemm_native_validate_and_configure_window(&t_lhs_info, &t_rhs_info, nullptr, &t_dst_info, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, gemm_info,
-                                                                                      num_elements_processed);
-        ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-        cl_code.window = win_config.second;
         TOCK(cond0_build_time, measurements)
 
         TICK(cond0_tune_time)
author	Gunes Bayir <gunes.bayir@arm.com>	2022-03-10 21:21:01 +0000
committer	Gunes Bayir <gunes.bayir@arm.com>	2022-03-17 17:41:21 +0000
commit	8a87983c90299dfc7d6fbda3dba312e7603d7074 (patch)
tree	ad1299cd902e6b2e5662f3f6e1b8fd12835b8469
parent	193cad36d8ff70792562390b554304cc19284f61 (diff)
download	ComputeLibrary-8a87983c90299dfc7d6fbda3dba312e7603d7074.tar.gz