From 30271c779c36a2abe6995c4454674d92bbc1f91f Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Mon, 24 Jun 2019 14:56:34 +0100 Subject: COMPMID-2156: Optimized dilated convolution for NEON. Change-Id: I3a8abe8cc9637c8983d9bd69dcbaee1a15eac8d0 Signed-off-by: Georgios Pinitas Reviewed-on: https://review.mlplatform.org/c/1492 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Pablo Marquez --- .../NEON/functions/NEDepthwiseConvolutionLayer.h | 119 +++++++++++++++++++++ .../NEDepthwiseConvolutionAssemblyDispatch.h | 39 ++++--- 2 files changed, 141 insertions(+), 17 deletions(-) (limited to 'arm_compute/runtime') diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h index 396e2368c3..81bf53ace6 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h @@ -42,6 +42,7 @@ namespace arm_compute { +// Forward declarations class ITensor; /** Basic function to execute a depthwise convolution for kernel size 3x3xC. This function calls the following NEON kernels: @@ -157,6 +158,124 @@ private: bool _is_prepared; }; +/** Basic function to execute optimized depthwise convolution routines. 
This function calls the following NEON kernels: + * + * @note At the moment 3x3 and 5x5 convolution of stride 1, 2 are supported + * + * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present + * -# @ref NEDepthwiseConvolutionLayer3x3Kernel if 3x3 and no assembly kernel implementation is present + * -# @ref NEDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present + * -# @ref NEDirectConvolutionLayerOutputStageKernel if re-quantization of output is required + * -# @ref NEActivationLayer if fused activation is required + * + */ +class NEDepthwiseConvolutionLayerOptimized : public IFunction +{ +public: + /** Default constructor */ + NEDepthwiseConvolutionLayerOptimized(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDepthwiseConvolutionLayerOptimized(const NEDepthwiseConvolutionLayerOptimized &) = delete; + /** Default move constructor */ + NEDepthwiseConvolutionLayerOptimized(NEDepthwiseConvolutionLayerOptimized &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDepthwiseConvolutionLayerOptimized &operator=(const NEDepthwiseConvolutionLayerOptimized &) = delete; + /** Default move assignment operator */ + NEDepthwiseConvolutionLayerOptimized &operator=(NEDepthwiseConvolutionLayerOptimized &&) = default; + /** Initialize the function's source, destination, kernels and border_size. + * + * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). + * @param[in] weights Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input. + * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input. + * @param[out] output Destination tensor. Data type supported: same as @p input. 
+ * @param[in] conv_info Padding and stride information to use for the convolution. + * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + */ + void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + + /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayerOptimized + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). + * @param[in] weights Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input. + * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input. + * @param[in] output Destination tensor. Data type supported: same as @p input. + * @param[in] conv_info Padding and stride information to use for the convolution. + * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + + // Inherited methods overridden: + void run() override; + void prepare() override; + +private: + /** Configure the kernels/functions for the generic pipeline. + * + * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). + * @param[in] weights Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input. + * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input. + * @param[out] output Destination tensor. Data type supported: same as @p input. + * @param[in] conv_info Padding and stride information to use for the convolution. + * @param[in] depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] act_info Activation layer information in case of a fused activation. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * + */ + void configure_generic(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, + unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation = Size2D(1U, 1U)); + /** Configure the kernels/functions for the optimized pipeline. + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). + * @param[in] weights Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input. + * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. 
Must be nullptr if not needed. + * Data type supported: Same as @p input. + * @param[out] output Destination tensor. Data type supported: same as @p input. + * @param[in] conv_info Padding and stride information to use for the convolution. + * @param[in] depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] act_info Activation layer information in case of a fused activation. + */ + void configure_optimized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, + unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation = Size2D(1U, 1U)); + /** Run generic kernel */ + void run_generic(); + /** Run optimized function */ + void run_optimized(); + +private: + MemoryGroup _memory_group; + NEDepthwiseConvolutionLayer3x3Kernel _dwc_kernel; + NEDepthwiseConvolutionAssemblyDispatch _dwc_optimized_func; + NEDirectConvolutionLayerOutputStageKernel _output_stage_kernel; + NEFillBorderKernel _border_handler; + NEPermute _permute_input; + NEPermute _permute_weights; + NEPermute _permute_output; + NEActivationLayer _activationlayer_function; + Tensor _accumulator; + Tensor _permuted_input; + Tensor _permuted_weights; + Tensor _permuted_output; + const ITensor *_original_weights; + bool _has_bias; + bool _is_quantized; + bool _is_optimized; + bool _is_nchw; + bool _permute; + bool _is_activationlayer_enabled; + bool _is_prepared; +}; + /** Basic function to execute a generic depthwise convolution. 
This function calls the following NEON kernels: * * -# @ref NEDepthwiseIm2ColKernel diff --git a/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h b/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h index 7d2cff7315..b88e750fa9 100644 --- a/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h +++ b/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h @@ -30,9 +30,6 @@ #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" -#include "arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h" -#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp" - namespace arm_compute { /** Depthwise convolution assembly kernel glue */ @@ -52,38 +49,44 @@ public: NEDepthwiseConvolutionAssemblyDispatch &operator=(const NEDepthwiseConvolutionAssemblyDispatch &) = delete; /** Default move assignment operator */ NEDepthwiseConvolutionAssemblyDispatch &operator=(NEDepthwiseConvolutionAssemblyDispatch &&) = default; + /** Default destructor */ + ~NEDepthwiseConvolutionAssemblyDispatch(); /** Initialize the function's source, destination, kernels and border_size. * * @note Supports only NHWC format * * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor. These are 3D tensors with shape [3, 3, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input. * @param[in] bias (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. * Data type supported: Same as @p input. * @param[out] output Destination tensor. Data type supported: same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. 
* @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). */ void configure(const ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, - const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const Size2D &dilation = Size2D(1, 1)); /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionAssemblyDispatch * * @note Supports only NHWC format * * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor. These are 3D tensors with shape [3, 3, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input. * @param[in] bias (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. * Data type supported: Same as @p input. * @param[out] output Destination tensor. Data type supported: same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). 
* * @return An error status */ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, - const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const Size2D &dilation = Size2D(1, 1)); /** Check if the optimized kernel can be used for the given kernel sizes and strides * * @warning Even if this return true the inputs and outputs might need to get permuted as the only layout supported is NHWC @@ -103,16 +106,18 @@ public: void prepare() override; private: - MemoryGroup _memory_group; - const ITensor *_input; - const ITensor *_weights; - const ITensor *_bias; - ITensor *_output; - Tensor _packed_weights; - Tensor _workspace; - bool _is_prepared; - std::unique_ptr _dwc_assembly_kernel; - NEDepthwiseConvolutionAssemblyKernelWrapper _dwc_acl_kernel; + struct LocalImpl; + +private: + MemoryGroup _memory_group; + const ITensor *_input; + const ITensor *_weights; + const ITensor *_bias; + ITensor *_output; + Tensor _packed_weights; + Tensor _workspace; + bool _is_prepared; + std::unique_ptr<LocalImpl> _pImpl; }; } // namespace arm_compute #endif /* __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H__ */ -- cgit v1.2.1