From 93b75e0c072c3cc5654fcdf6aed1068b40012081 Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Mon, 21 Jun 2021 12:00:43 +0100 Subject: Port NEGEMM to memory injecting interface (Part 1) - Start porting NEGEMM to the new API - Port NEGEMMInterleave4x4Kernel to the new API - Port NEGEMMMatrixAdditionKernel to the new API - Port NEGEMMTranspose1xWKernel to the new API - Remove padding from NEGEMMMatrixAdditionKernel - Remove unused INESimpleKernel and ICPPSimpleKernel Partially resolves: COMPMID-4402 Change-Id: I63edadddfe00a54586e5384d6a0211db25ae9042 Signed-off-by: Michele Di Giorgio Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5857 Reviewed-by: Georgios Pinitas Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins --- arm_compute/core/CPP/ICPPSimpleKernel.h | 76 ---------------------- arm_compute/core/Types.h | 4 +- .../runtime/NEON/functions/NEConvolutionLayer.h | 6 +- .../runtime/NEON/functions/NEFullyConnectedLayer.h | 2 +- arm_compute/runtime/NEON/functions/NEGEMM.h | 55 +++------------- .../NEON/functions/NEGEMMConvolutionLayer.h | 4 +- .../NEON/functions/NEGEMMLowpMatrixMultiplyCore.h | 4 +- arm_compute/runtime/NEON/functions/NELSTMLayer.h | 2 +- 8 files changed, 19 insertions(+), 134 deletions(-) delete mode 100644 arm_compute/core/CPP/ICPPSimpleKernel.h (limited to 'arm_compute') diff --git a/arm_compute/core/CPP/ICPPSimpleKernel.h b/arm_compute/core/CPP/ICPPSimpleKernel.h deleted file mode 100644 index c31d487a45..0000000000 --- a/arm_compute/core/CPP/ICPPSimpleKernel.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2017-2019 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_ICPPSIMPLEKERNEL_H -#define ARM_COMPUTE_ICPPSIMPLEKERNEL_H - -#include "arm_compute/core/CPP/ICPPKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** Interface for simple C++ kernels having 1 tensor input and 1 tensor output */ -class ICPPSimpleKernel : public ICPPKernel -{ -public: - /** Constructor */ - ICPPSimpleKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - ICPPSimpleKernel(const ICPPSimpleKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - ICPPSimpleKernel &operator=(const ICPPSimpleKernel &) = delete; - /** Allow instances of this class to be moved */ - ICPPSimpleKernel(ICPPSimpleKernel &&) = default; - /** Allow instances of this class to be moved */ - ICPPSimpleKernel &operator=(ICPPSimpleKernel &&) = default; - /** Default destructor */ - ~ICPPSimpleKernel() = default; - -protected: - /** Configure the kernel - * - * @param[in] input Source tensor. - * @param[out] output Destination tensor. - * @param[in] num_elems_processed_per_iteration Number of processed elements per iteration. - * @param[in] border_undefined (Optional) True if the border mode is undefined. False if it's replicate or constant. - * @param[in] border_size (Optional) Size of the border. - */ - void configure(const ITensor *input, ITensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined = false, const BorderSize &border_size = BorderSize()); - /** Static function to check if given info will lead to a valid configuration of @ref ICPPSimpleKernel. - * - * @param[in] input Source tensor info. - * @param[in] output Destination tensor info. - * @param[in] num_elems_processed_per_iteration Number of processed elements per iteration. - * @param[in] border_undefined (Optional) True if the border mode is undefined. False if it's replicate or constant. - * @param[in] border_size (Optional) Size of the border. 
- */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_elems_processed_per_iteration, - bool border_undefined = false, const BorderSize &border_size = BorderSize()); - -protected: - const ITensor *_input; - ITensor *_output; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_ICPPSIMPLEKERNEL_H */ diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index 48c87cd8ac..f6658e7544 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -1750,10 +1750,10 @@ private: /** GEMM reshape information class. This class stores the necessary information about matrix A and matrix B reshape. * - * The matrix A can only be reshaped through @ref opencl::kernels::ClGemmReshapeLhsMatrixKernel or @ref NEGEMMInterleave4x4Kernel + * The matrix A can only be reshaped through @ref opencl::kernels::ClGemmReshapeLhsMatrixKernel or @ref cpu::kernels::CpuGemmInterleave4x4Kernel * Note: Optionally just for @ref opencl::kernels::ClGemmReshapeLhsMatrixKernel is it possible to set mult_interleave4x4_height, the multiplication factor for the height of the 4x4 interleaved block * - * The matrix B can only be reshaped through @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel or @ref NEGEMMTranspose1xWKernel + * The matrix B can only be reshaped through @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel or @ref cpu::kernels::CpuGemmTranspose1xWKernel * Note: Optionally just for @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel is it possible to set mult_transpose1xW_width, the multiplication factor for the width of the 1xW transposed block * */ diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h index f19aa8008b..bb4c456787 100644 --- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h @@ -111,7 +111,7 @@ public: * Data types supported: Same as @p input. 
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights - * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input. + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation @@ -133,7 +133,7 @@ public: * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights - * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input. + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation @@ -156,7 +156,7 @@ public: * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. 
* @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights - * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input. + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h index d2cd60e576..22ec9e0fec 100644 --- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h +++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h @@ -80,7 +80,7 @@ private: * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer) * -# @ref NETranspose (if @p are_weights_reshaped is set to false and transpose_weights is set to true ) (called once) * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized asymmetric) - * -# @ref NEGEMMMatrixAdditionKernel or @ref NEGEMMLowpOutputStage (if quantized asymmetric) (if @p biases is not equal to nullptr) + * -# @ref cpu::kernels::CpuGemmMatrixAdditionKernel or @ref NEGEMMLowpOutputStage (if quantized asymmetric) (if @p biases is not equal to nullptr) * * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. 
*/ diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h index 6c5be0eb5e..c1ae11bcbf 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMM.h +++ b/arm_compute/runtime/NEON/functions/NEGEMM.h @@ -29,40 +29,26 @@ #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/IWeightsManager.h" #include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" -#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" -#include "arm_compute/runtime/Tensor.h" -#include "src/core/helpers/MemoryHelpers.h" #include <memory> namespace arm_compute { -// Forward declarations -class NEGEMMInterleave4x4Kernel; -class NEGEMMMatrixAdditionKernel; -class NEGEMMMatrixMultiplyKernel; -class NEGEMMTranspose1xWKernel; -namespace cpu -{ -class CpuGemmAssemblyDispatch; -} - /** Basic function to execute GEMM. This function calls the following kernels: * * If optimized assembly is available: * -# @ref cpu::CpuGemmAssemblyDispatch - * -# @ref NEActivationLayer (if alpha != 1.0) + * -# @ref cpu::CpuActivation (if alpha != 1.0) * Else: - * -# @ref NEGEMMInterleave4x4Kernel (if the output tensor is a matrix) - * -# @ref NEGEMMTranspose1xWKernel (if the output tensor is a matrix) + * -# @ref cpu::kernels::CpuGemmInterleave4x4Kernel (if the output tensor is a matrix) + * -# @ref cpu::kernels::CpuGemmTranspose1xWKernel (if the output tensor is a matrix) * -# @ref NEGEMMMatrixMultiplyKernel * In both cases: - * -# @ref NEGEMMMatrixAdditionKernel (if c != nullptr and beta != 0.0 and is not reshaped once) + * -# @ref cpu::kernels::CpuGemmMatrixAdditionKernel (if c != nullptr and beta != 0.0 and is not reshaped once) * Else: - * -# @ref NEArithmeticAddition (if c != nullptr and is reshaped once and not optimized assembly in place) + * -# @ref cpu::CpuAdd (if c != nullptr and is reshaped once and not optimized assembly in place) * - * -# @ref NEActivationLayer (if activation is specified 
in GEMMInfo) + * -# @ref cpu::CpuActivation (if activation is specified in GEMMInfo) */ class NEGEMM : public IFunction { @@ -117,33 +103,8 @@ public: void prepare() override; private: - MemoryGroup _memory_group; - IWeightsManager *_weights_manager; - std::unique_ptr<NEGEMMInterleave4x4Kernel> _interleave_kernel; - std::unique_ptr<NEGEMMTranspose1xWKernel> _transpose_kernel; - std::unique_ptr<NEGEMMMatrixMultiplyKernel> _mm_kernel; - std::unique_ptr<cpu::CpuGemmAssemblyDispatch> _asm_glue; - std::unique_ptr<NEGEMMMatrixAdditionKernel> _ma_kernel; - NEActivationLayer _alpha_scale_func; - NEArithmeticAddition _add_bias; - NEActivationLayer _activation_func; - - Tensor _tmp_a; - Tensor _tmp_b; - Tensor _tmp_d; - const ITensor *_original_b; - bool _run_vector_matrix_multiplication; - bool _run_alpha_scale; - bool _run_addition; - bool _run_bias_addition; - bool _run_activation; - bool _reshape_b_only_on_first_run; - bool _is_prepared; - - ITensorPack _asm_glue_run_pack; - ITensorPack _asm_glue_prep_pack; - WorkspaceData _asm_glue_workspace; - experimental::MemoryRequirements _aux_mem_req; + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEGEMM_H */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h index edb58e956a..d334d518e2 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h @@ -203,7 +203,7 @@ public: * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights - * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input. + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. 
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported @@ -223,7 +223,7 @@ public: * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights - * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input. + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h index ff888760e1..60cfd8f91d 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h @@ -35,8 +35,8 @@ namespace arm_compute class ITensor; /** Basic function to execute GEMMLowpMatrixMultiplyCore. 
This function calls the following kernels if the DOT product instruction is not available: * - * -# @ref NEGEMMInterleave4x4Kernel - * -# @ref NEGEMMTranspose1xWKernel + * -# @ref cpu::kernels::CpuGemmInterleave4x4Kernel + * -# @ref cpu::kernels::CpuGemmTranspose1xWKernel * -# @ref NEGEMMLowpMatrixMultiplyKernel * -# @ref NEGEMMLowpOffsetContributionKernel * -# @ref NEActivationLayer diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayer.h b/arm_compute/runtime/NEON/functions/NELSTMLayer.h index 075fb4530a..4272215486 100644 --- a/arm_compute/runtime/NEON/functions/NELSTMLayer.h +++ b/arm_compute/runtime/NEON/functions/NELSTMLayer.h @@ -25,7 +25,7 @@ #define ARM_COMPUTE_NELSTMLAYER_H #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" +#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h" -- cgit v1.2.1