From 881f2ded860fc1db23810076b699c4492556c376 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Fri, 12 Apr 2019 10:29:17 +0100 Subject: COMPMID-2048: Add support for dilation in NEDepthwiseConvolution. Change-Id: If9941e770779fbf918ba5ff0573da9378078b969 Signed-off-by: Usama Arif Reviewed-on: https://review.mlplatform.org/c/999 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Pablo Marquez --- .../kernels/NEDepthwiseConvolutionLayer3x3Kernel.h | 15 +- .../core/NEON/kernels/NEDepthwiseIm2ColKernel.h | 15 +- .../kernels/detail/NEDirectConvolutionDetail.h | 376 ++++++++++++++++++++- .../NEON/functions/NEDepthwiseConvolutionLayer.h | 12 +- .../NEDepthwiseConvolutionAssemblyDispatch.h | 3 +- 5 files changed, 404 insertions(+), 17 deletions(-) (limited to 'arm_compute') diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h index 87ca4da05b..c0381cb8d7 100644 --- a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h @@ -58,21 +58,25 @@ public: * @param[out] output Destination tensor. Data type supported: Same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * */ - void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1); + void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U)); /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer3x3Kernel * * @note Supported data layouts: NCHW and NHWC * - * @param[in] input Source tensor. DataType supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor. This is a 3D tensor with dimensions [3, 3, IFM] for NCHW or [IFM, 3, 3] if NHWC data layout. Data type supported: Same as @p input. - * @param[in] output Destination tensor. Data type supported: Same as @p input. + * @param[in] input Source tensor info. DataType supported: QASYMM8/F16/F32. + * @param[in] weights Weights tensor info. This is a 3D tensor with dimensions [3, 3, IFM] for NCHW or [IFM, 3, 3] if NHWC data layout. Data type supported: Same as @p input. + * @param[in] output Destination tensor info. Data type supported: Same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1); + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, + const Size2D &dilation = Size2D(1U, 1U)); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -86,6 +90,7 @@ private: PadStrideInfo _conv_info; unsigned int _num_elems_written_per_iteration; unsigned int _depth_multiplier; + Size2D _dilation; }; } // namespace arm_compute #endif /* __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONKERNEL3x3_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h index de671361d6..3e123b4839 100644 --- a/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -62,23 +62,27 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] has_bias Boolean that specifies if the depthwise convolution has bias. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). */ - void configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias = false, unsigned int depth_multiplier = 1); + void configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias = false, unsigned int depth_multiplier = 1, + const Size2D &dilation = Size2D(1U, 1U)); /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseIm2ColKernel * - * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], + * @param[in] input The input tensor info to convert. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32 - * @param[in] output The output tensor. First 3 lower dimensions represent a transform of each 3D input, + * @param[in] output The output tensor info. First 3 lower dimensions represent a transform of each 3D input, * while every dimension above 3 represents a batch. Data types supported: Same as @p input * @param[in] kernel_dims The kernel dimensions (width and height). * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] has_bias Boolean that specifies if the depthwise convolution has bias. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias = false, unsigned int depth_multiplier = 1); + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias = false, unsigned int depth_multiplier = 1, + const Size2D &dilation = Size2D(1U, 1U)); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -104,6 +108,7 @@ private: PadStrideInfo _conv_info; bool _has_bias; unsigned int _depth_multiplier; + Size2D _dilation; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_NEDEPTHWISEIM2COLKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h index e6dc43a47b..3547d2d110 100644 --- a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h +++ b/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -79,6 +79,125 @@ inline int32x4x3_t load_matrix_row(const uint8_t *ptr, int weights_offset = 0) return r; } +/** Perform a 3x3 convolution for 4 consecutive elements on float32 when dilation.x() or dilation.y() is not 1. + * + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] dilation_x Dilation, in elements across x. + * @param[in] input_offset (Optional) Input quantization offset. + * + */ +inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low, + const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, + const size_t dilation_x, int input_offset) +{ + ARM_COMPUTE_UNUSED(input_offset); + + const float32x4x3_t vtop = + { + { + vld1q_f32(in_top), + vld1q_f32(in_top + dilation_x), + vld1q_f32(in_top + 2 * dilation_x) + } + }; + const float32x4x3_t vmid = + { + { + vld1q_f32(in_mid), + vld1q_f32(in_mid + dilation_x), + vld1q_f32(in_mid + 2 * dilation_x) + } + }; + const float32x4x3_t vlow = + { + { + vld1q_f32(in_low), + vld1q_f32(in_low + dilation_x), + vld1q_f32(in_low + 2 * dilation_x) + } + }; + float32x4_t out = vmulq_f32(vtop.val[0], m0.val[0]); + out = vmlaq_f32(out, vtop.val[1], m0.val[1]); + out = vmlaq_f32(out, vtop.val[2], m0.val[2]); + + out = vmlaq_f32(out, vmid.val[0], m1.val[0]); + out = vmlaq_f32(out, vmid.val[1], m1.val[1]); + out = vmlaq_f32(out, vmid.val[2], m1.val[2]); + + out = vmlaq_f32(out, vlow.val[0], m2.val[0]); + out = vmlaq_f32(out, vlow.val[1], m2.val[1]); + out = vmlaq_f32(out, vlow.val[2], m2.val[2]); + + return out; +} + +/** Perform a 3x3 convolution for 8 consecutive elements on float32 when dilation.x() or dilation.y() is not 1. + * + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] dilation_x Dilation, in elements across x. + * @param[in] input_offset (Optional) Input quantization offset. + * + */ +template +float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low, + const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, + const size_t dilation_x, int input_offset = 0); + +template <> +inline float32x4x2_t convolve_3x3_dilation<1>(const float *in_top, const float *in_mid, const float *in_low, + const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, + const size_t dilation_x, int input_offset) +{ + ARM_COMPUTE_UNUSED(input_offset); + + const float32x4x2_t out = + { + { + single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), + single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset) + } + }; + + return out; +} + +template <> +inline float32x4x2_t convolve_3x3_dilation<2>(const float *in_top, const float *in_mid, const float *in_low, + const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, + const size_t dilation_x, int input_offset) +{ + ARM_COMPUTE_UNUSED(input_offset); + + float32x4x2_t out = convolve_3x3_dilation<1>(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); + return out; +} + +template <> +inline float32x4x2_t convolve_3x3_dilation<3>(const float *in_top, const float *in_mid, const float *in_low, + const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, + const size_t dilation_x, int input_offset) +{ + ARM_COMPUTE_UNUSED(input_offset); + + float32x4x2_t out = convolve_3x3_dilation<1>(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset); + ; + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); + return out; +} + /** Perform a convolve3x3 on float32. * * @param[in] in_top Pointer to the first row of the input. @@ -183,6 +302,143 @@ inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, c return out; } +/** Perform a 3x3 convolution for 4 consecutive elements on uint8_t when dilation.x() or dilation.y() is not 1. + * + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] dilation_x Dilation, in elements across x. + * @param[in] input_offset Input quantization offset. + * + */ +inline int32x4_t single_convolve_3x3_dilation(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low, + const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, + size_t dilation_x, int input_offset) +{ + const int32x4_t v_input_offset = vdupq_n_s32(input_offset); + + const uint8x8x3_t vtop = + { + { + vld1_u8(in_top), + vld1_u8(in_top + dilation_x), + vld1_u8(in_top + 2 * dilation_x) + } + }; + const uint8x8x3_t vmid = + { + { + vld1_u8(in_mid), + vld1_u8(in_mid + dilation_x), + vld1_u8(in_mid + 2 * dilation_x) + } + }; + const uint8x8x3_t vlow = + { + { + vld1_u8(in_low), + vld1_u8(in_low + dilation_x), + vld1_u8(in_low + 2 * dilation_x) + } + }; + + const int32x4x3_t vtop_s32 = + { + { + vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vtop.val[0])))), //convert from uint8x8 to uint16x8, to uint16x4(lower or bottom half) to int16x4 to int32x4 + vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vtop.val[1])))), + vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vtop.val[2])))), + } + }; + const int32x4x3_t vmid_s32 = + { + { + vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vmid.val[0])))), + vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vmid.val[1])))), + vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vmid.val[2])))), + } + }; + const int32x4x3_t vlow_s32 = + { + { + vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vlow.val[0])))), + vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vlow.val[1])))), + vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vlow.val[2])))), + } + }; + + int32x4_t out = vmulq_s32(vtop_s32.val[0], m0.val[0]); + out = vmlaq_s32(out, vtop_s32.val[1], m0.val[1]); + out = vmlaq_s32(out, vtop_s32.val[2], m0.val[2]); + + out = vmlaq_s32(out, vmid_s32.val[0], m1.val[0]); + out = vmlaq_s32(out, vmid_s32.val[1], m1.val[1]); + out = vmlaq_s32(out, vmid_s32.val[2], m1.val[2]); + + out = vmlaq_s32(out, vlow_s32.val[0], m2.val[0]); + out = vmlaq_s32(out, vlow_s32.val[1], m2.val[1]); + out = vmlaq_s32(out, vlow_s32.val[2], m2.val[2]); + + return out; +} + +/** Perform a 3x3 convolution for 4 consecutive elements on uint8_t when dilation.x() or dilation.y() is not 1. + * + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] dilation_x Dilation, in elements across x. + * @param[in] input_offset Input quantization offset. + * + */ +template +int32x4x2_t convolve_3x3_dilation(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low, + const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, + const size_t dilation_x, int input_offset); + +template <> +inline int32x4x2_t convolve_3x3_dilation<1>(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, + const size_t dilation_x, int input_offset) +{ + const int32x4x2_t out = + { + { + single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), + single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset) + } + }; + return out; +} + +template <> +inline int32x4x2_t convolve_3x3_dilation<2>(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low, + const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, + const size_t dilation_x, int input_offset) +{ + int32x4x2_t out = convolve_3x3_dilation<1>(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset); + + out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[0], 2), out.val[0], 1); + out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[1], 0), out.val[0], 2); + out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[1], 2), out.val[0], 3); + return out; +} + +template <> +inline int32x4x2_t convolve_3x3_dilation<3>(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low, + const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, + const size_t dilation_x, int input_offset) +{ + int32x4x2_t out = convolve_3x3_dilation<1>(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset); + out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[0], 3), out.val[0], 1); + return out; +} + /** Perform a convolve3x3 on uint8_t * * @param[in] in_top Pointer to the first row of the input. @@ -390,6 +646,124 @@ inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset = return r; } +/** Perform a 3x3 convolution for 8 consecutive elements on float16 when dilation.x() or dilation.y() is not 1. + * + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] dilation_x Dilation, in elements across x. + * @param[in] input_offset (Optional)Input quantization offset. + * + */ +inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, + const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, + const size_t dilation_x, int input_offset = 0) +{ + ARM_COMPUTE_UNUSED(input_offset); + const float16x8x3_t vtop = + { + { + vld1q_f16(in_top), + vld1q_f16(in_top + dilation_x), + vld1q_f16(in_top + 2 * dilation_x) + } + }; + const float16x8x3_t vmid = + { + { + vld1q_f16(in_mid), + vld1q_f16(in_mid + dilation_x), + vld1q_f16(in_mid + 2 * dilation_x) + } + }; + const float16x8x3_t vlow = + { + { + vld1q_f16(in_low), + vld1q_f16(in_low + dilation_x), + vld1q_f16(in_low + 2 * dilation_x) + } + }; + float16x8_t out = vmulq_f16(vtop.val[0], m0.val[0]); + out = vaddq_f16(out, vmulq_f16(vtop.val[1], m0.val[1])); + out = vaddq_f16(out, vmulq_f16(vtop.val[2], m0.val[2])); + + out = vaddq_f16(out, vmulq_f16(vmid.val[0], m1.val[0])); + out = vaddq_f16(out, vmulq_f16(vmid.val[1], m1.val[1])); + out = vaddq_f16(out, vmulq_f16(vmid.val[2], m1.val[2])); + + out = vaddq_f16(out, vmulq_f16(vlow.val[0], m2.val[0])); + out = vaddq_f16(out, vmulq_f16(vlow.val[1], m2.val[1])); + out = vaddq_f16(out, vmulq_f16(vlow.val[2], m2.val[2])); + + return out; +} + +/** Perform a 3x3 convolution for 16 consecutive elements on float16 when dilation.x() or dilation.y() is not 1. + * + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] dilation_x Dilation, in elements across x. + * @param[in] input_offset (Optional)Input quantization offset. + * + */ +template +float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, + const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, + const size_t dilation_x, int input_offset = 0); + +template <> +inline float16x8x2_t convolve_3x3_dilation<1>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, + const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, + const size_t dilation_x, int input_offset) +{ + const float16x8x2_t out = + { + { + single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), + single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset) + } + }; + return out; +} + +template <> +inline float16x8x2_t convolve_3x3_dilation<2>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, + const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, + const size_t dilation_x, int input_offset) +{ + ARM_COMPUTE_UNUSED(input_offset); + float16x8x2_t out = convolve_3x3_dilation<1>(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 4), out.val[0], 2); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 3); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 4); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 5); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 4), out.val[0], 6); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 6), out.val[0], 7); + return out; +} + +template <> +inline float16x8x2_t convolve_3x3_dilation<3>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, + const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, + const size_t dilation_x, int input_offset) +{ + ARM_COMPUTE_UNUSED(input_offset); + float16x8x2_t out = convolve_3x3_dilation<1>(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 1), out.val[0], 3); + return out; +} + /** Perform a convolve3x3 on float16. * * @param[in] in_top Pointer to the first row of the input. diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h index c60233664d..396e2368c3 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h @@ -73,7 +73,7 @@ public: * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). Currently supports (1,1) only. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). */ void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); @@ -88,7 +88,7 @@ public: * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). Currently supports (1,1) only. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * * @return a status */ @@ -110,9 +110,11 @@ private: * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. * @param[in] act_info Activation layer information in case of a fused activation. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * */ void configure_generic(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info); + unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation = Size2D(1U, 1U)); /** Configure the kernels/functions for the optimized pipeline. * * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). @@ -186,7 +188,7 @@ public: * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). Currently supports (1,1) only. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). */ void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); @@ -201,7 +203,7 @@ public: * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). Currently supports (1,1) only. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * * @return a status */ diff --git a/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h b/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h index df8f29d2c7..7d2cff7315 100644 --- a/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h +++ b/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h @@ -92,10 +92,11 @@ public: * @param[in] weights Weights tensor info. * @param[in] conv_info Convolution layer metadata. * @param[in] depth_multiplier (Optional) Depth multiplier to be used. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * * @return True if the assembly kernel could be used else false. Note that transformations of input/output could be needed. */ - static bool is_optimized_supported(const ITensorInfo *input, const ITensorInfo *weights, PadStrideInfo conv_info, unsigned int depth_multiplier = 1); + static bool is_optimized_supported(const ITensorInfo *input, const ITensorInfo *weights, PadStrideInfo conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1, 1)); // Inherited methods overridden: void run() override; -- cgit v1.2.1