From f01201abec0a102f6e7a517971f83fef1eaffd50 Mon Sep 17 00:00:00 2001
From: Giuseppe Rossini
Date: Wed, 6 Nov 2019 14:57:49 +0000
Subject: COMPMID-2305: NEDepthwiseConvolution 3x3: support for QUANT8_PER_CHANNEL_SYMM

Change-Id: I9a917cff6a089ce6ae16fb4e6066a4194e2e9487
Signed-off-by: Giuseppe Rossini
Reviewed-on: https://review.mlplatform.org/c/2241
Tested-by: Arm Jenkins
Reviewed-by: Michele Di Giorgio
Comments-Addressed: Arm Jenkins
Reviewed-by: Pablo Marquez
---
Reviewer notes (not part of the commit; text between "---" and the first
"diff --git" line is commentary that git does not apply):

 * This patch was recovered from a web view that collapsed all whitespace and
   dropped angle-bracketed text. The items below were re-inserted so that the
   declarations are well-formed. They are reconstructions -- confirm against
   upstream commit f01201ab before relying on them:
     - "template <typename T, typename U>" ahead of the primary declarations
       of saturating_doubling_high_mul and rounding_divide_by_exp2. Both
       signatures name T and U; whether upstream gives U a default argument
       cannot be told from here.
     - the self-referencing argument
       "QSymm8HybridPerChannelDepthwiseConvolution<OutputTileRows, ...>"
       passed to DepthwiseConvolutionBase (base-class list and "using Base").
     - "template <nck::ActivationFunction Activation>" ahead of both
       execute_tile overloads -- presumed from the constructor's
       nck::ActivationFunction parameter; TODO confirm.
 * Indentation inside the hunks was re-created (the original spacing is lost).
   If context lines fail to match, apply with "git apply --ignore-whitespace".
 * Author/reviewer e-mail addresses were lost the same way and are left out
   rather than guessed.
 * The line count of each hunk was checked against its "@@" header:
   62 added + 6 context = 68, and 94 added + 4 context = 98.
 * Code observations for follow-up (deliberately NOT changed here, so the
   patch stays faithful to the commit): the first hunk adds a second
   "#pragma once" and two header-scope "using namespace" directives, which
   become visible to every file that includes this header.

 .../convolution/depthwise/depthwise_quantized.hpp | 156 +++++++++++++++++++++
 1 file changed, 156 insertions(+)

(limited to 'arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp')

diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
index f8db4db6cc..ef3adc4c0c 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
@@ -25,6 +25,68 @@
 #pragma once
 #include "depthwise.hpp"
 #include "qasymm8.hpp"
+#include "qsymm8.hpp"
+#pragma once
+
+using namespace neon_convolution_kernels;
+using namespace qasymm8;
+
+template <typename T, typename U>
+inline T saturating_doubling_high_mul(const T&, const U&);
+
+template <>
+inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32x4_t& b)
+{
+  return vqrdmulhq_s32(a, b);
+}
+
+template <>
+inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32_t& b)
+{
+  return vqrdmulhq_n_s32(a, b);
+}
+
+template <>
+inline int32_t saturating_doubling_high_mul(const int32_t& a, const int32_t& b)
+{
+  return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
+}
+
+template <typename T, typename U>
+inline T rounding_divide_by_exp2(const T& x, const U exponent);
+
+template <>
+inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int32x4_t shift)
+{
+  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
+  const int32x4_t fixed = vqaddq_s32(x, fixup);
+  return vrshlq_s32(fixed, shift);
+}
+
+template <>
+inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent)
+{
+  const int32x4_t shift = vdupq_n_s32(-exponent);
+  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
+  const int32x4_t fixed = vqaddq_s32(x, fixup);
+  return vrshlq_s32(fixed, shift);
+}
+
+template <>
+inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent)
+{
+  const int32x2_t shift = vdup_n_s32(-exponent);
+  const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
+  const int32x2_t fixed = vqadd_s32(x, fixup);
+  return vrshl_s32(fixed, shift);
+}
+
+template <>
+inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent)
+{
+  const int32x2_t xs = vdup_n_s32(x);
+  return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
+}
 
 namespace depthwise
 {
@@ -145,4 +207,98 @@ class QAsymm8DepthwiseConvolution : public DepthwiseConvolutionBase<
     const qasymm8::QAsymm8RescaleParams rescale_parameters;
 };
 
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
+>
+class QSymm8HybridPerChannelDepthwiseConvolution : public DepthwiseConvolutionBase<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols,
+  StrideRows, StrideCols,
+  uint8_t, int32_t, uint8_t,
+  QSymm8HybridPerChannelDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
+>
+{
+  using Base = DepthwiseConvolutionBase<
+    OutputTileRows, OutputTileCols,
+    KernelRows, KernelCols,
+    StrideRows, StrideCols,
+    uint8_t, int32_t, uint8_t,
+    QSymm8HybridPerChannelDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
+  >;
+  friend Base;
+  using InputType = typename Base::InputType;
+  using OutputType = typename Base::OutputType;
+
+  public:
+    QSymm8HybridPerChannelDepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      nck::ActivationFunction activation,
+      const qsymm8::QSymm8PerChannelParams& weight_quantisation,
+      const qasymm8::QAsymm8Params& input_quantisation,
+      const qasymm8::QAsymm8Params& output_quantisation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+    QSymm8HybridPerChannelDepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      nck::ActivationFunction activation,
+      const qsymm8::QSymm8PerChannelParams& weight_quantisation,
+      const qasymm8::QAsymm8Params& input_quantisation,
+      const qasymm8::QAsymm8Params& output_quantisation,
+      const qsymm8::QSymm8PerChannelRescaleParams& rescale_parameters,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+    size_t get_packed_params_size(void) const override
+    {
+      return this->n_channels() * (sizeof(int8_t)*KernelRows*KernelCols + 3*sizeof(int32_t));
+
+    }
+
+  protected:
+    uint8_t _input_padding_value(void) const;
+
+    void _pack_params(
+      void *buffer,
+      const void *weights,
+      unsigned int weight_row_stride,
+      unsigned int weight_col_stride,
+      const void *biases=nullptr
+    ) const;
+
+    template <nck::ActivationFunction Activation>
+    void execute_tile(
+      int n_channels,
+      const void* packed_params,
+      const uint8_t* inptr,
+      unsigned int in_row_stride,
+      unsigned int in_col_stride,
+      uint8_t* outptr,
+      unsigned int out_row_stride,
+      unsigned int out_col_stride
+    );
+
+    template <nck::ActivationFunction Activation>
+    void execute_tile(
+      int n_channels,
+      const void* packed_params,
+      const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+      uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
+    );
+
+  private:
+    // Quantization parameters
+    const qsymm8::QSymm8PerChannelParams _weights_quant;
+    const qasymm8::QAsymm8Params _input_quant, _output_quant;
+    const qsymm8::QSymm8PerChannelRescaleParams _rescale_parameters;
+};
+
 } // namespace depthwise
-- 
cgit v1.2.1