From 93b75e0c072c3cc5654fcdf6aed1068b40012081 Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Mon, 21 Jun 2021 12:00:43 +0100 Subject: Port NEGEMM to memory injecting interface (Part 1) - Start porting NEGEMM to the new API - Port NEGEMMInterleave4x4Kernel to the new API - Port NEGEMMMatrixAdditionKernel to the new API - Port NEGEMMTranspose1xWKernel to the new API - Remove padding from NEGEMMMatrixAdditionKernel - Remove unused INESimpleKernel and ICPPSimpleKernel Partially resolves: COMPMID-4402 Change-Id: I63edadddfe00a54586e5384d6a0211db25ae9042 Signed-off-by: Michele Di Giorgio Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5857 Reviewed-by: Georgios Pinitas Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins --- .../runtime/NEON/functions/NEConvolutionLayer.h | 6 +-- .../runtime/NEON/functions/NEFullyConnectedLayer.h | 2 +- arm_compute/runtime/NEON/functions/NEGEMM.h | 55 ++++------------------ .../NEON/functions/NEGEMMConvolutionLayer.h | 4 +- .../NEON/functions/NEGEMMLowpMatrixMultiplyCore.h | 4 +- arm_compute/runtime/NEON/functions/NELSTMLayer.h | 2 +- 6 files changed, 17 insertions(+), 56 deletions(-) (limited to 'arm_compute/runtime/NEON') diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h index f19aa8008b..bb4c456787 100644 --- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h @@ -111,7 +111,7 @@ public: * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights - * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input. + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation @@ -133,7 +133,7 @@ public: * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights - * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input. + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation @@ -156,7 +156,7 @@ public: * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights - * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input. + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h index d2cd60e576..22ec9e0fec 100644 --- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h +++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h @@ -80,7 +80,7 @@ private: * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer) * -# @ref NETranspose (if @p are_weights_reshaped is set to false and transpose_weights is set to true ) (called once) * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized asymmetric) - * -# @ref NEGEMMMatrixAdditionKernel or @ref NEGEMMLowpOutputStage (if quantized asymmetric) (if @p biases is not equal to nullptr) + * -# @ref cpu::kernels::CpuGemmMatrixAdditionKernel or @ref NEGEMMLowpOutputStage (if quantized asymmetric) (if @p biases is not equal to nullptr) * * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h index 6c5be0eb5e..c1ae11bcbf 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMM.h +++ b/arm_compute/runtime/NEON/functions/NEGEMM.h @@ -29,40 +29,26 @@ #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/IWeightsManager.h" #include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" -#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" -#include "arm_compute/runtime/Tensor.h" -#include "src/core/helpers/MemoryHelpers.h" #include namespace arm_compute { -// Forward declarations -class NEGEMMInterleave4x4Kernel; -class NEGEMMMatrixAdditionKernel; -class NEGEMMMatrixMultiplyKernel; -class NEGEMMTranspose1xWKernel; -namespace cpu -{ -class CpuGemmAssemblyDispatch; -} - /** Basic function to execute GEMM. This function calls the following kernels: * * If optimized assembly is available: * -# @ref cpu::CpuGemmAssemblyDispatch - * -# @ref NEActivationLayer (if alpha != 1.0) + * -# @ref cpu::CpuActivation (if alpha != 1.0) * Else: - * -# @ref NEGEMMInterleave4x4Kernel (if the output tensor is a matrix) - * -# @ref NEGEMMTranspose1xWKernel (if the output tensor is a matrix) + * -# @ref cpu::kernels::CpuGemmInterleave4x4Kernel (if the output tensor is a matrix) + * -# @ref cpu::kernels::CpuGemmTranspose1xWKernel (if the output tensor is a matrix) * -# @ref NEGEMMMatrixMultiplyKernel * In both cases: - * -# @ref NEGEMMMatrixAdditionKernel (if c != nullptr and beta != 0.0 and is not reshaped once) + * -# @ref cpu::kernels::CpuGemmMatrixAdditionKernel (if c != nullptr and beta != 0.0 and is not reshaped once) * Else: - * -# @ref NEArithmeticAddition (if c != nullptr and is reshaped once and not optimized assembly in place) + * -# @ref cpu::CpuAdd (if c != nullptr and is reshaped once and not optimized assembly in place) * - * -# @ref NEActivationLayer (if activation is specified in GEMMInfo) + * -# @ref cpu::CpuActivation (if activation is specified in GEMMInfo) */ class NEGEMM : public IFunction { @@ -117,33 +103,8 @@ public: void prepare() override; private: - MemoryGroup _memory_group; - IWeightsManager *_weights_manager; - std::unique_ptr _interleave_kernel; - std::unique_ptr _transpose_kernel; - std::unique_ptr _mm_kernel; - std::unique_ptr _asm_glue; - std::unique_ptr _ma_kernel; - NEActivationLayer _alpha_scale_func; - NEArithmeticAddition _add_bias; - NEActivationLayer _activation_func; - - Tensor _tmp_a; - Tensor _tmp_b; - Tensor _tmp_d; - const ITensor *_original_b; - bool _run_vector_matrix_multiplication; - bool _run_alpha_scale; - bool _run_addition; - bool _run_bias_addition; - bool _run_activation; - bool _reshape_b_only_on_first_run; - bool _is_prepared; - - ITensorPack _asm_glue_run_pack; - ITensorPack _asm_glue_prep_pack; - WorkspaceData _asm_glue_workspace; - experimental::MemoryRequirements _aux_mem_req; + struct Impl; + std::unique_ptr _impl; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEGEMM_H */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h index edb58e956a..d334d518e2 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h @@ -203,7 +203,7 @@ public: * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights - * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input. + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported @@ -223,7 +223,7 @@ public: * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights - * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input. + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h index ff888760e1..60cfd8f91d 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h @@ -35,8 +35,8 @@ namespace arm_compute class ITensor; /** Basic function to execute GEMMLowpMatrixMultiplyCore. This function calls the following kernels if the DOT product instruction is not available: * - * -# @ref NEGEMMInterleave4x4Kernel - * -# @ref NEGEMMTranspose1xWKernel + * -# @ref cpu::kernels::CpuGemmInterleave4x4Kernel + * -# @ref cpu::kernels::CpuGemmTranspose1xWKernel * -# @ref NEGEMMLowpMatrixMultiplyKernel * -# @ref NEGEMMLowpOffsetContributionKernel * -# @ref NEActivationLayer diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayer.h b/arm_compute/runtime/NEON/functions/NELSTMLayer.h index 075fb4530a..4272215486 100644 --- a/arm_compute/runtime/NEON/functions/NELSTMLayer.h +++ b/arm_compute/runtime/NEON/functions/NELSTMLayer.h @@ -25,7 +25,7 @@ #define ARM_COMPUTE_NELSTMLAYER_H #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" +#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h" -- cgit v1.2.1