diff options
9 files changed, 198 insertions, 114 deletions
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp index e40f9c6da9..6db1ca4cf5 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp @@ -61,13 +61,15 @@ Status add_tensor_intermed(ClKernelBlueprint &kernel_blueprint, ArgumentID &id) return Status{}; } -Status add_kcomp_gemm_native(ClKernelBlueprint &kernel_blueprint, const ClKernelComponentDescriptor &, const GemmNativeDescriptor &, +Status add_kcomp_gemm_native(ClKernelBlueprint &kernel_blueprint, const ClKernelComponentDescriptor &, + const GemmNativeDescriptor &gemm_native_desc, ArgumentID lhs_id, ArgumentID rhs_id, ArgumentID bias_id, ArgumentID &dst_id) { kernel_blueprint.impl().validate_arg_ids({ lhs_id, rhs_id, bias_id, dst_id }); - kernel_blueprint.impl().add_component( std::make_unique<ClGemmNativeKernelComponent>( + &kernel_blueprint, + gemm_native_desc, SharedVarLink{ lhs_id, SharedVarIO::Input, kernel_blueprint.impl().group(lhs_id) }, SharedVarLink{ rhs_id, SharedVarIO::Input, kernel_blueprint.impl().group(rhs_id) }, SharedVarLink{ dst_id, SharedVarIO::Output, kernel_blueprint.impl().group(dst_id) }, @@ -81,6 +83,7 @@ Status add_kcomp_eltwise_add(ClKernelBlueprint &kernel_blueprint, const ClKernel { kernel_blueprint.impl().add_component( std::make_unique<ClElementwiseAddKernelComponent>( + &kernel_blueprint, SharedVarLink{ src0_id, SharedVarIO::Input, kernel_blueprint.impl().group(src0_id) }, SharedVarLink{ src1_id, SharedVarIO::Input, kernel_blueprint.impl().group(src1_id) }, SharedVarLink{ dst_id, SharedVarIO::Output, kernel_blueprint.impl().group(dst_id) })); @@ -98,6 +101,7 @@ Status add_kcomp_store(ClKernelBlueprint &kernel_blueprint, const ClKernelCompon case StoreType::StoreBlockBoundaryAware: kernel_blueprint.impl().add_component( std::make_unique<ClStoreBlockBoundaryAwareKernelComponent>( + &kernel_blueprint, SharedVarLink{ src_tile, SharedVarIO::Input, kernel_blueprint.impl().group(src_tile) }, SharedVarLink{ dst_tile, SharedVarIO::Output, kernel_blueprint.impl().group(dst_tile) })); break; diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h index 15622c848d..27ab294cc9 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h @@ -100,13 +100,13 @@ struct ClKernelComponentDescriptor /** Component: Tensor Argument */ struct ClTensorDescriptor { - ClTensorDescriptor(const ITensorInfo *info, unsigned int dim) + ClTensorDescriptor(ITensorInfo *info, unsigned int dim) : tensor_info(info), slice_dim(dim) { } - const ITensorInfo *tensor_info; - unsigned int slice_dim; + ITensorInfo *tensor_info; + unsigned int slice_dim; }; Status add_tensor_argument(ClKernelBlueprint &, const ClTensorDescriptor &, ArgumentID &); @@ -133,8 +133,8 @@ struct GemmNativeDescriptor int32_t b_offset{}; }; -Status add_kcomp_gemm_native(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const GemmNativeDescriptor &, ArgumentID input_id, - ArgumentID weights_id, ArgumentID bias_id, ArgumentID &dst_id); +Status add_kcomp_gemm_native(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const GemmNativeDescriptor &, + ArgumentID input_id, ArgumentID weights_id, ArgumentID bias_id, ArgumentID &dst_id); /** Component: Eltwise Add */ struct EltwiseAddDescriptor diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h index 3b5160a055..b285cc2b54 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h @@ -29,6 +29,7 @@ #include "arm_compute/core/CL/CLCompileContext.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/GPUTarget.h" +#include "src/core/common/Macros.h" #include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h" @@ -63,8 +64,8 @@ enum class SharedVarGroup Automatic // Automatic variables declared within the kernel body }; -/** Specifies a shared variable ink for a component. - * It describes all the information that's availbale when a component is constructed / added: +/** Specifies a shared variable link for a component. + * It describes all the information that's available when a component is constructed / added: * e.g. its linkage (via ArgumentID and io) and its group * This is not shared variable on its own, but is used for instantiating a SharedVar when building the code */ @@ -204,6 +205,13 @@ public: }; using TagLUT = std::unordered_map<Tag, TagVal>; // Used to instantiating a code template / replacing tags public: + IClKernelComponent(const ClKernelBlueprint *blueprint) + : _blueprint(blueprint) + { + } + + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClKernelComponent); + virtual ~IClKernelComponent() = default; virtual ComponentType get_component_type() const = 0; virtual std::vector<Link> get_links() const = 0; @@ -278,6 +286,11 @@ public: { return ""; } + + virtual Window get_window() const + { + return Window{}; + } /** "Allocate" all shared variables used in a component to the @p vtable, and generate a TagLUT used to instantiate the component code * * @param vtable @@ -290,6 +303,9 @@ public: return ""; } +protected: + const ClKernelBlueprint *_blueprint; + private: ComponentID _id{}; }; @@ -398,6 +414,12 @@ public: // Additionally, set this component as one that treats this argument as "Output" (append to index 1) else { + if(component->get_component_type() == ComponentType::Store) + { + ARM_COMPUTE_ERROR_ON_MSG(_dst_id >= 0, "Trying to add more than one dst argument to the graph"); + _dst_id = arg_id; + } + for(const auto &subseq_component : _outgoing_components[arg_id]) { _component_graph[component_id].push_back(subseq_component); @@ -430,7 +452,6 @@ public: stack.pop(); } - std::cout << name << std::endl; return name; } @@ -508,7 +529,15 @@ public: Window get_execution_window() const { - return Window{}; + ARM_COMPUTE_ERROR_ON_MSG(_graph_root < 0, "No root found in the component graph"); + ARM_COMPUTE_ERROR_ON_MSG(_dst_id == -1, "Destination Tensor Id should be ready before calling get_execution_window()"); + + return _components.find(_graph_root)->second->get_window(); + } + + ArgumentID get_dst_id() const + { + return _dst_id; } ClKernelArgList get_arguments() const @@ -521,6 +550,26 @@ public: return arg_list; } + const ClTensorDescriptor *get_kernel_argument(const ArgumentID id) const + { + auto it = _kernel_arguments.find(id); + if(it != _kernel_arguments.end()) + { + return &_kernel_arguments.find(id)->second; + } + return nullptr; + } + + ITensorInfo *get_kernel_argument_info(const ArgumentID id) const + { + const ClTensorDescriptor *arg_desc = get_kernel_argument(id); + if(arg_desc != nullptr) + { + return arg_desc->tensor_info; + } + return nullptr; + } + private: void topological_sort_utility(ComponentID component_id, std::unordered_set<ComponentID> &visited, std::stack<ComponentID> &stack) const { @@ -635,6 +684,8 @@ private: int32_t _num_components{}; int32_t _num_complex_components{}; + ArgumentID _dst_id{ -1 }; + // Argument, components and intermediate tensors IDs with corresponding ptrs (except intermediate) std::unordered_map<ComponentID, ComponentUniquePtr> _components{}; std::unordered_map<ArgumentID, ClTensorDescriptor> _kernel_arguments{}; diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp index a44b5faee2..06c29c4253 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp @@ -24,6 +24,9 @@ #if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h" +#include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" namespace arm_compute { @@ -41,6 +44,26 @@ std::set<std::string> ClElementwiseAddKernelComponent::get_headers_list() const return std::set<std::string> { "gemm_helpers.h", "repeat.h" }; } +Window ClElementwiseAddKernelComponent::get_window() const +{ + const ITensorInfo *lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id); + const ITensorInfo *rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id); + ITensorInfo *dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); + + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs_info, rhs_info, dst_info); + + const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*lhs_info, *rhs_info); + const TensorShape &out_shape = broadcast_pair.first; + + auto_init_if_empty(*dst_info, out_shape, 1, lhs_info->data_type()); + + const unsigned int vector_size_byte_opencl = 16; + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst_info->element_size(), dst_info->dimension(0)); + Window win = calculate_max_window(*dst_info, Steps(num_elems_processed_per_iteration)); + + return win; +} + std::string ClElementwiseAddKernelComponent::get_component_code() const { std::string code; diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h index c0de4ac9b8..fe5f964c54 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h @@ -37,13 +37,15 @@ namespace dynamic_fusion class ClElementwiseAddKernelComponent : public IClKernelComponent { public: - ClElementwiseAddKernelComponent(const Link &lhs, const Link &rhs, const Link &dst) - : _lhs{ lhs }, _rhs{ rhs }, _dst{ dst } + ClElementwiseAddKernelComponent(const ClKernelBlueprint *blueprint, const Link &lhs, const Link &rhs, const Link &dst) + : IClKernelComponent(blueprint), _lhs{ lhs }, _rhs{ rhs }, _dst{ dst } { } + ComponentType get_component_type() const override; std::set<std::string> get_headers_list() const override; std::string get_component_code() const override; + Window get_window() const override; virtual std::vector<Link> get_links() const override { diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp index 1521973d55..e70e5d5ea5 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp @@ -24,6 +24,9 @@ #if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h" +#include "arm_compute/core/TensorInfo.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/WindowHelpers.h" namespace arm_compute { @@ -41,6 +44,92 @@ std::set<std::string> ClGemmNativeKernelComponent::get_headers_list() const return std::set<std::string> { "./common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h", "gemm_helpers.h", "repeat.h" }; } +Window ClGemmNativeKernelComponent::get_window() const +{ + ITensorInfo *lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id); + ITensorInfo *rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id); + ITensorInfo *bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id); + ITensorInfo *dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); + + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs_info, rhs_info, dst_info); + + bool reinterpret_input_as_3d = _desc.reinterpret_input_as_3d; + bool reinterpret_output_as_3d = _desc.depth_output_gemm3d != 0; + + Window win{}; + Window win_out{}; + bool window_changed = false; + + // In case both input and dst have to be reinterpreted as 3D tensors, + // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. + if(reinterpret_input_as_3d == reinterpret_output_as_3d) + { + reinterpret_output_as_3d = false; + } + + // activation_layer is set to dummy because it's required by GEMMKernelInfo, but it's not used in shape calculation + GEMMKernelInfo gemm_info(_desc.m, _desc.n, _desc.k, _desc.depth_output_gemm3d, _desc.reinterpret_input_as_3d, + _desc.broadcast_bias, _desc.fp_mixed_precision, _desc.has_pad_y, ActivationLayerInfo(), _desc.nmult_transpose1xW_width, + _desc.mult_interleave4x4_height, _desc.lhs_info, _desc.rhs_info, _desc.a_offset, _desc.b_offset); + + // dst tensor auto initialization if not yet initialized + auto_init_if_empty(*dst_info, lhs_info->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*lhs_info, *rhs_info, gemm_info))); + + TensorInfo tmp_info(*dst_info); + + if(reinterpret_output_as_3d) + { + // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, + // the window needs to be constructed on the 2D collapsed version of the tensor + TensorShape tmp_shape(dst_info->tensor_shape()); + tmp_shape.collapse(2U, 1U); + tmp_info.set_tensor_shape(tmp_shape); + } + + win = calculate_max_window(tmp_info, Steps(_desc.rhs_info.n0, _desc.lhs_info.m0)); + win_out = calculate_max_window(*dst_info, Steps(_desc.rhs_info.n0, _desc.lhs_info.m0)); + + AccessWindowStatic src0_access(lhs_info, 0, 0, + lhs_info->dimension(0), + lhs_info->dimension(1)); + AccessWindowStatic src1_access(rhs_info, 0, 0, + ceil_to_multiple(rhs_info->dimension(0), _desc.rhs_info.n0), + rhs_info->dimension(1)); + AccessWindowStatic dst_access(dst_info, 0, 0, + dst_info->dimension(0), + dst_info->dimension(1)); + + if(bias_info != nullptr) + { + const int bias_processed_per_iteration_x = _desc.rhs_info.n0; + + AccessWindowStatic src2_access(bias_info, 0, 0, + ceil_to_multiple(bias_info->dimension(0), bias_processed_per_iteration_x), + bias_info->dimension(1)); + + window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop + update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor + } + else + { + window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop + update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor + } + + // Collapse along the Z direction + // This collapse needs to be here in order to tune the Z dimension of LWS + Window collapsed = win; + const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst_info->num_dimensions()), 2u); + collapsed = win.collapse(win, dimension_to_collapse); + + if(window_changed == true) + { + ARM_COMPUTE_ERROR("Insufficient Padding!"); + } + + return collapsed; +} + std::string ClGemmNativeKernelComponent::get_additional_macros() const { return R"_( diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h index 38f007c07c..09933a8932 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h @@ -26,7 +26,10 @@ #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLGEMMNATIVEKERNELCOMPONENT_H #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLGEMMNATIVEKERNELCOMPONENT_H +#include "arm_compute/core/Steps.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h" +#include "src/core/helpers/AutoConfiguration.h" namespace arm_compute { @@ -37,14 +40,17 @@ namespace dynamic_fusion class ClGemmNativeKernelComponent : public IClKernelComponent { public: - ClGemmNativeKernelComponent(const Link &lhs, const Link &rhs, const Link &dst, const Link &bias = Link{}) - : _lhs{ lhs }, _rhs{ rhs }, _bias{ bias }, _dst{ dst } + ClGemmNativeKernelComponent(const ClKernelBlueprint *blueprint, const GemmNativeDescriptor &desc, + const Link &lhs, const Link &rhs, const Link &dst, const Link &bias = Link{}) + : IClKernelComponent(blueprint), _desc{ desc }, _lhs{ lhs }, _rhs{ rhs }, _bias{ bias }, _dst{ dst } { } + ComponentType get_component_type() const override; std::set<std::string> get_headers_list() const override; std::string get_additional_macros() const override; std::string get_component_code() const override; + Window get_window() const override; ClKernelArgList get_args(); virtual std::vector<Link> get_links() const override @@ -60,10 +66,11 @@ public: } private: - Link _lhs{}; - Link _rhs{}; - Link _bias{}; - Link _dst{}; + GemmNativeDescriptor _desc{}; + Link _lhs{}; + Link _rhs{}; + Link _bias{}; + Link _dst{}; }; } // namespace dynamic_fusion diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h index f0d01d30a9..ad7a207ef8 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h @@ -37,8 +37,8 @@ namespace dynamic_fusion class ClStoreBlockBoundaryAwareKernelComponent : public IClKernelComponent { public: - ClStoreBlockBoundaryAwareKernelComponent(const Link &src, const Link &dst) - : _src{ src }, _dst{ dst } + ClStoreBlockBoundaryAwareKernelComponent(const ClKernelBlueprint *blueprint, const Link &src, const Link &dst) + : IClKernelComponent(blueprint), _src{ src }, _dst{ dst } { } ComponentType get_component_type() const override; diff --git a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp index c4e7033914..753e0a4625 100644 --- a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp +++ b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp @@ -21,7 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ - #if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) #include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h" @@ -77,85 +76,6 @@ void fill(U &&tensor, int seed) library->fill_borders_with_garbage(tensor, distribution_inf, seed); } -using ElementsProcessed = Steps; -std::pair<Status, Window> mock_gemm_native_validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, - const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) -{ - unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; - unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; - bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; - - Window win{}; - Window win_out{}; - bool window_changed = false; - - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_output_as_3d) - { - reinterpret_output_as_3d = false; - } - - // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); - - TensorInfo tmp_info(*dst); - - if(reinterpret_output_as_3d) - { - // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(dst->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - // Configure kernel window - num_elems_processed_per_iteration_x = rhs_info.n0; - num_elems_processed_per_iteration_y = lhs_info.m0; - - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - AccessWindowStatic src0_access(src0, 0, 0, - src0->dimension(0), - src0->dimension(1)); - AccessWindowStatic src1_access(src1, 0, 0, - ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), - src1->dimension(1)); - AccessWindowStatic dst_access(dst, 0, 0, - dst->dimension(0), - dst->dimension(1)); - - if(src2 != nullptr) - { - const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; - - AccessWindowStatic src2_access(src2, 0, 0, - ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), - src2->dimension(1)); - - window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor - } - else - { - window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor - } - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, collapsed); -} - void set_build_options(ClKernelCode &cl_code, GemmNativeDescriptor gemm_native_desc, const TensorInfo &t_lhs_info, const TensorInfo &t_rhs_info, @@ -241,7 +161,7 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL) const auto t_dst_shape = TensorShape(n, m); auto t_lhs_info = TensorInfo(t_lhs_shape, 1, data_type); auto t_rhs_info = TensorInfo(t_rhs_shape, 1, data_type); - const auto t_bias_info = TensorInfo(TensorShape(), 1, DataType::F32); + auto t_bias_info = TensorInfo(TensorShape(), 1, DataType::F32); auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type); const ClTensorDescriptor t_lhs_desc{ &t_lhs_info, 2 }; @@ -270,7 +190,6 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL) ArgumentID tid_acc; st = add_tensor_intermed(bp, tid_acc); st = add_kcomp_gemm_native(bp, common_kernel_desc, gemm_native_desc, tid_lhs, tid_rhs, tid_l0_bias, tid_acc); - st = add_kcomp_eltwise_add(bp, common_kernel_desc, EltwiseAddDescriptor{}, tid_l1_addend, tid_acc, tid_acc); st = add_kcomp_store(bp, common_kernel_desc, tid_acc, tid_dst, StoreType::StoreBlockBoundaryAware); @@ -278,13 +197,7 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL) st = set_tile_info(bp, store_tile_info); st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp); - set_build_options(cl_code, gemm_native_desc, t_lhs_info, t_rhs_info, nullptr, t_dst_info); - ElementsProcessed num_elements_processed{}; - auto win_config = mock_gemm_native_validate_and_configure_window(&t_lhs_info, &t_rhs_info, nullptr, &t_dst_info, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, gemm_info, - num_elements_processed); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - cl_code.window = win_config.second; ClExecutionDescriptor exec_desc; st = tune_static(exec_desc, cl_code); @@ -432,11 +345,6 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL) st = set_tile_info(bp, store_tile_info); st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp); set_build_options(cl_code, gemm_native_desc, t_lhs_info, t_rhs_info, nullptr, t_dst_info); - ElementsProcessed num_elements_processed{}; - auto win_config = mock_gemm_native_validate_and_configure_window(&t_lhs_info, &t_rhs_info, nullptr, &t_dst_info, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, gemm_info, - num_elements_processed); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - cl_code.window = win_config.second; TOCK(cond0_build_time, measurements) TICK(cond0_tune_time) |