From fa79fda2c797282de3589aaa69b06e065e8a21e0 Mon Sep 17 00:00:00 2001 From: Mohammed Suhail Munshi Date: Tue, 20 Sep 2022 11:49:23 +0100 Subject: =?UTF-8?q?Optimize=20Neon=E2=84=A2=20Logistic=20Activation=20-=20?= =?UTF-8?q?Use=20a=201d=20execution=20window=20to=20improve=20memory=20acc?= =?UTF-8?q?ess=20pattern.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resolves: [COMPMID-5465] Signed-off-by: Mohammed Suhail Munshi Change-Id: Ida30669ffa06eb002ca43a6edf15e25a6eaad2f6 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8344 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gunes Bayir Benchmark: Arm Jenkins --- src/core/helpers/WindowHelpers.cpp | 55 +++++++++++++++++++++++++++++---- src/core/helpers/WindowHelpers.h | 12 +++++++ src/cpu/kernels/CpuActivationKernel.cpp | 24 +++++++++++--- src/cpu/kernels/CpuActivationKernel.h | 10 ++++++ src/cpu/operators/CpuActivation.cpp | 10 +++++- src/cpu/operators/CpuActivation.h | 5 ++- 6 files changed, 104 insertions(+), 12 deletions(-) diff --git a/src/core/helpers/WindowHelpers.cpp b/src/core/helpers/WindowHelpers.cpp index fa152c9f58..a4d46db352 100644 --- a/src/core/helpers/WindowHelpers.cpp +++ b/src/core/helpers/WindowHelpers.cpp @@ -234,15 +234,15 @@ Window calculate_max_window_horizontal(const ValidRegion &valid_region, const St std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &src0, const ITensorInfo &src1) { - const auto &shape0 = src0.tensor_shape(); - const auto &shape1 = src1.tensor_shape(); - const auto &strides0 = src0.strides_in_bytes(); - const auto &strides1 = src1.strides_in_bytes(); - const auto num_dimensions = std::max(src0.num_dimensions(), src1.num_dimensions()); + const auto &shape0 = src0.tensor_shape(); + const auto &shape1 = src1.tensor_shape(); + const auto &strides0 = src0.strides_in_bytes(); + const auto &strides1 = src1.strides_in_bytes(); + const auto num_dimensions = std::max(src0.num_dimensions(),
src1.num_dimensions()); Window win; size_t split_dimension = Window::DimY; - size_t dim = 0; + size_t dim = 0; size_t squashed_bytes = src0.element_size(); @@ -282,4 +282,47 @@ std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &sr return std::make_pair(win, split_dimension); } + +std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &src) +{ + const auto &shape = src.tensor_shape(); + const auto &strides = src.strides_in_bytes(); + const auto num_dimensions = src.num_dimensions(); + + Window win; + size_t split_dimension = Window::DimY; + size_t dim = 0; + size_t squashed_bytes = src.element_size(); + + // Try to squash the low dimensions together. + for(; dim < num_dimensions; ++dim) + { + if(strides[dim] != squashed_bytes) + { + break; + } + squashed_bytes *= shape[dim]; + } + if(dim == num_dimensions) + { + const auto squashed_elements = squashed_bytes / src.element_size(); + split_dimension = Window::DimX; + // The input tensor can be interpreted as 1D array. + win.set(0, Window::Dimension(0, squashed_elements, 1)); + for(dim = 1; dim < Coordinates::num_max_dimensions; ++dim) + { + win.set(dim, Window::Dimension(0, 1, 1)); + } + } + else + { + // Generate the max window. + for(dim = 0; dim < Coordinates::num_max_dimensions; ++dim) + { + win.set(dim, Window::Dimension(0, shape[dim], 1)); + } + } + return std::make_pair(win, split_dimension); +} + } // namespace arm_compute diff --git a/src/core/helpers/WindowHelpers.h b/src/core/helpers/WindowHelpers.h index c9e5a135c0..eccf7f2d18 100644 --- a/src/core/helpers/WindowHelpers.h +++ b/src/core/helpers/WindowHelpers.h @@ -176,6 +176,18 @@ inline Window calculate_max_enlarged_window(const ITensorInfo &info, const Steps return calculate_max_enlarged_window(info.valid_region(), steps, border_size); } +/** Calculate the squashed or maximum window for the given tensor shape.
+ * + * If the tensor data resides contiguously in memory, the tensor can be interpreted + * as 1D array and all the dimensions can be squashed together into the x-dimension. + * Otherwise, generate the max window for the given tensor shape. + * + * @param[in] src Tensor info object defining the shape of the input tensor. + * + * @return The maximum window the kernel can be executed on and the preferred split dimension. + */ +std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &src); + /** Calculate the squashed or maximum window for the given tensor shapes. * * If the tensor data resides continuously in the memory, the tensor can be interpreted diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp index 61efcb2dd6..f1e485883c 100644 --- a/src/cpu/kernels/CpuActivationKernel.cpp +++ b/src/cpu/kernels/CpuActivationKernel.cpp @@ -182,10 +182,16 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src, void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo activation_info) { + ARM_COMPUTE_UNUSED(dst); ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, activation_info)); const auto uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa(), activation_info.activation() }); + if(dst != nullptr) + { + // dst auto initialization if not yet initialized + auto_init_if_empty(*dst, *src->clone()); + } ARM_COMPUTE_ERROR_ON_NULLPTR(uk); @@ -200,10 +206,20 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac #endif // __aarch64__ _act_info = activation_info; - // Configure kernel window - auto win_config = validate_and_configure_window(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICPPKernel::configure(win_config.second); + Window win; + + if(src->data_layout() != DataLayout::NHWC) + { + // Use squashed window + std::tie(win,
_split_dimension) = calculate_squashed_or_max_window(*src); + ICPPKernel::configure(win); + } + else + { + // Configure kernel window + win = calculate_max_window(*src, Steps()); + ICPPKernel::configure(win); + } } Status CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info) diff --git a/src/cpu/kernels/CpuActivationKernel.h b/src/cpu/kernels/CpuActivationKernel.h index d856a9357f..fe2d783059 100644 --- a/src/cpu/kernels/CpuActivationKernel.h +++ b/src/cpu/kernels/CpuActivationKernel.h @@ -73,6 +73,15 @@ public: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; + /** Get the preferred dimension in which the scheduler splits the work into multiple jobs. + * + * @return The split dimension hint. + */ + size_t get_split_dimension_hint() const + { + return _split_dimension; + } + struct ActivationKernel { const char *name; @@ -85,6 +94,7 @@ public: private: ActivationLayerInfo _act_info{}; ActivationKernelPtr _run_method{ nullptr }; + size_t _split_dimension{ Window::DimY }; std::string _name{}; }; } // namespace kernels diff --git a/src/cpu/operators/CpuActivation.cpp b/src/cpu/operators/CpuActivation.cpp index 3945fa59a5..197e9850b9 100644 --- a/src/cpu/operators/CpuActivation.cpp +++ b/src/cpu/operators/CpuActivation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,6 +23,7 @@ */ #include "src/cpu/operators/CpuActivation.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/common/IOperator.h" #include "src/common/utils/LegacySupport.h" #include "src/common/utils/Log.h" @@ -46,6 +47,13 @@ Status CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *outp return kernels::CpuActivationKernel::validate(input, output, activation_info); } +void CpuActivation::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + auto split_dimension = static_cast<kernels::CpuActivationKernel *>(_kernel.get())->get_split_dimension_hint(); + NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors); +} + std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate) { TensorInfo src_info = detail::convert_to_legacy_tensor_info(src); diff --git a/src/cpu/operators/CpuActivation.h b/src/cpu/operators/CpuActivation.h index 9b97c9d24f..f1807d5e47 100644 --- a/src/cpu/operators/CpuActivation.h +++ b/src/cpu/operators/CpuActivation.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -48,6 +48,9 @@ public: * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; }; } // namespace cpu } // namespace arm_compute -- cgit v1.2.1