From f01201abec0a102f6e7a517971f83fef1eaffd50 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Wed, 6 Nov 2019 14:57:49 +0000 Subject: COMPMID-2305: NEDepthwiseConvolution 3x3: support for QUANT8_PER_CHANNEL_SYMM Change-Id: I9a917cff6a089ce6ae16fb4e6066a4194e2e9487 Signed-off-by: Giuseppe Rossini Reviewed-on: https://review.mlplatform.org/c/2241 Tested-by: Arm Jenkins Reviewed-by: Michele Di Giorgio Comments-Addressed: Arm Jenkins Reviewed-by: Pablo Marquez --- .../NEON/kernels/convolution/common/qsymm8.hpp | 76 ++++ .../convolution/depthwise/depthwise_quantized.hpp | 156 +++++++ .../NEON/kernels/convolution/common/qsymm8.cpp | 185 +++++++++ .../convolution/depthwise/depthwise_qs8_qs8.cpp | 31 ++ .../kernels/convolution/depthwise/impl_base.hpp | 3 +- .../kernels/convolution/depthwise/impl_qa8_qa8.hpp | 48 --- .../depthwise/impl_qa8_qs8_per_channel.hpp | 457 +++++++++++++++++++++ .../NEON/functions/NEDepthwiseConvolutionLayer.cpp | 9 +- .../NEDepthwiseConvolutionAssemblyDispatch.cpp | 95 ++++- .../validation/NEON/DepthwiseConvolutionLayer.cpp | 27 ++ 10 files changed, 1030 insertions(+), 57 deletions(-) create mode 100644 arm_compute/core/NEON/kernels/convolution/common/qsymm8.hpp create mode 100644 src/core/NEON/kernels/convolution/common/qsymm8.cpp create mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp create mode 100644 src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp diff --git a/arm_compute/core/NEON/kernels/convolution/common/qsymm8.hpp b/arm_compute/core/NEON/kernels/convolution/common/qsymm8.hpp new file mode 100644 index 0000000000..41bfbe4d8a --- /dev/null +++ b/arm_compute/core/NEON/kernels/convolution/common/qsymm8.hpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#pragma once
+#include <cstdint>
+#include <vector>
+#include "qasymm8.hpp"
+
+
+namespace qsymm8 {
+
+struct QSymm8Params {
+  int8_t quantize(float value) const;
+  float dequantize(int8_t value) const;
+
+  float scale;
+};
+
+struct QSymm8RescaleParams {
+  static QSymm8RescaleParams
+  make_rescale_params(const QSymm8Params &weight_quant,
+                      const QSymm8Params &input_quant,
+                      const QSymm8Params &output_quant);
+
+  QSymm8RescaleParams(int32_t shift, int32_t multiplier, float rescale);
+
+  const int32_t shift, multiplier;
+  const float rescale;
+};
+
+struct QSymm8PerChannelParams {
+  int8_t quantize(float value, float scale) const;
+  float dequantize(int8_t value, float scale) const;
+
+  std::vector<float> scales;
+};
+
+struct QSymm8PerChannelRescaleParams {
+  static QSymm8PerChannelRescaleParams
+  make_rescale_params(const QSymm8PerChannelParams &weight_quant,
+                      const QSymm8PerChannelParams &input_quant,
+                      const QSymm8PerChannelParams &output_quant);
+
+  static QSymm8PerChannelRescaleParams
+  make_rescale_params(const QSymm8PerChannelParams &weight_quant,
+                      const qasymm8::QAsymm8Params &input_quant,
+                      const qasymm8::QAsymm8Params &output_quant);
+
+  QSymm8PerChannelRescaleParams(std::vector<int32_t>& shift, std::vector<int32_t>& multiplier, std::vector<float>& rescale);
+
+  std::vector<int32_t> shifts, multipliers;
+  std::vector<float> rescales;
+};
+
+} // namespace qsymm8
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
index f8db4db6cc..ef3adc4c0c 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
@@ -25,6 +25,68 @@
 #pragma once
 #include "depthwise.hpp"
 #include "qasymm8.hpp"
+#include "qsymm8.hpp"
+#pragma once
+
+using namespace neon_convolution_kernels;
+using namespace qasymm8;
+
+template <typename T, typename U>
+inline T saturating_doubling_high_mul(const T&, const U&);
+
+template <>
+inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32x4_t& b)
+{
+  return vqrdmulhq_s32(a, b);
+}
+
+template <>
+inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32_t& b)
+{
+  return vqrdmulhq_n_s32(a, b);
+}
+
+template <>
+inline int32_t saturating_doubling_high_mul(const int32_t& a, const int32_t& b)
+{
+  return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
+}
+
+template <typename T, typename U>
+inline T rounding_divide_by_exp2(const T& x, const U exponent);
+
+template <>
+inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int32x4_t shift)
+{
+  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
+  const int32x4_t fixed = vqaddq_s32(x, fixup);
+  return vrshlq_s32(fixed, shift);
+}
+
+template <>
+inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent)
+{
+  const int32x4_t shift = vdupq_n_s32(-exponent);
+  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
+  const int32x4_t fixed = vqaddq_s32(x, fixup);
+  return vrshlq_s32(fixed, shift);
+}
+
+template <>
+inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent)
+{
+  const int32x2_t shift = vdup_n_s32(-exponent);
+  const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
+  const int32x2_t fixed = vqadd_s32(x, fixup);
+  return vrshl_s32(fixed, shift);
+}
+
+template <>
+inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent)
+{
+  const int32x2_t xs = vdup_n_s32(x);
+  return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
+}
 
 namespace depthwise {
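Note: the helpers added above implement the gemmlowp-style fixed-point requantization used by these quantized depthwise kernels: an int32 accumulator is scaled by a quantized multiplier via a saturating doubling high multiply, then divided by a power of two with rounding. The standalone sketch below shows the scalar arithmetic the NEON intrinsics map to; the reference functions and example constants are illustrative only and are not part of this patch.

    // Scalar reference for the requantization performed by
    // saturating_doubling_high_mul() + rounding_divide_by_exp2() above.
    // Illustrative only; values are made up.
    #include <cstdint>
    #include <iostream>

    // (a * b * 2) >> 31, rounded and saturated: the scalar counterpart of vqrdmulh.
    static int32_t sat_doubling_high_mul(int32_t a, int32_t b)
    {
        if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX; // only overflow case
        const int64_t prod  = static_cast<int64_t>(a) * static_cast<int64_t>(b);
        const int32_t nudge = prod >= 0 ? (1 << 30) : (1 - (1 << 30));
        return static_cast<int32_t>((prod + nudge) >> 31);
    }

    // Divide by 2^exponent with round-to-nearest (gemmlowp's scalar reference rounding).
    static int32_t rounding_rshift(int32_t x, int exponent)
    {
        const int32_t mask      = (int32_t(1) << exponent) - 1;
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
        return (x >> exponent) + (remainder > threshold ? 1 : 0);
    }

    int main()
    {
        // Example values only: a raw int32 accumulator and a (multiplier, shift)
        // pair of the kind produced by make_rescale_params().
        const int32_t acc = 23500, multiplier = 1518500250, shift = 10;
        const int32_t requantized = rounding_rshift(sat_doubling_high_mul(acc, multiplier), shift);
        std::cout << requantized << '\n'; // scaled-down accumulator, before offset and clamping
    }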
@@ -145,4 +207,98 @@ class QAsymm8DepthwiseConvolution : public DepthwiseConvolutionBase< const qasymm8::QAsymm8RescaleParams rescale_parameters; }; +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +class QSymm8HybridPerChannelDepthwiseConvolution : public DepthwiseConvolutionBase< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + uint8_t, int32_t, uint8_t, + QSymm8HybridPerChannelDepthwiseConvolution +> +{ + using Base = DepthwiseConvolutionBase< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + uint8_t, int32_t, uint8_t, + QSymm8HybridPerChannelDepthwiseConvolution + >; + friend Base; + using InputType = typename Base::InputType; + using OutputType = typename Base::OutputType; + + public: + QSymm8HybridPerChannelDepthwiseConvolution( + int n_batches, int n_input_rows, int n_input_cols, int n_channels, + nck::ActivationFunction activation, + const qsymm8::QSymm8PerChannelParams& weight_quantisation, + const qasymm8::QAsymm8Params& input_quantisation, + const qasymm8::QAsymm8Params& output_quantisation, + unsigned int padding_top, + unsigned int padding_left, + unsigned int padding_bottom, + unsigned int padding_right + ); + + QSymm8HybridPerChannelDepthwiseConvolution( + int n_batches, int n_input_rows, int n_input_cols, int n_channels, + nck::ActivationFunction activation, + const qsymm8::QSymm8PerChannelParams& weight_quantisation, + const qasymm8::QAsymm8Params& input_quantisation, + const qasymm8::QAsymm8Params& output_quantisation, + const qsymm8::QSymm8PerChannelRescaleParams& rescale_parameters, + unsigned int padding_top, + unsigned int padding_left, + unsigned int padding_bottom, + unsigned int padding_right + ); + + size_t get_packed_params_size(void) const override + { + return this->n_channels() * (sizeof(int8_t)*KernelRows*KernelCols + 3*sizeof(int32_t)); + + } + + protected: + uint8_t _input_padding_value(void) const; + + void _pack_params( + void *buffer, + const void *weights, + unsigned int weight_row_stride, + unsigned int weight_col_stride, + const void *biases=nullptr + ) const; + + template + void execute_tile( + int n_channels, + const void* packed_params, + const uint8_t* inptr, + unsigned int in_row_stride, + unsigned int in_col_stride, + uint8_t* outptr, + unsigned int out_row_stride, + unsigned int out_col_stride + ); + + template + void execute_tile( + int n_channels, + const void* packed_params, + const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols], + uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols] + ); + + private: + // Quantization parameters + const qsymm8::QSymm8PerChannelParams _weights_quant; + const qasymm8::QAsymm8Params _input_quant, _output_quant; + const qsymm8::QSymm8PerChannelRescaleParams _rescale_parameters; +}; + } // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/common/qsymm8.cpp b/src/core/NEON/kernels/convolution/common/qsymm8.cpp new file mode 100644 index 0000000000..e50263acaa --- /dev/null +++ b/src/core/NEON/kernels/convolution/common/qsymm8.cpp @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2019 ARM Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+
+#include "qsymm8.hpp"
+
+namespace qsymm8 {
+#if(__ANDROID__ || BARE_METAL)
+template <typename T> T round(T val) { return ::round(val); }
+template <typename T> T exp2(T val) { return ::exp2(val); }
+template <typename T> T log2(T val) { return ::log2(val); }
+#else /* (__ANDROID__ || BARE_METAL) */
+template <typename T> T round(T val) { return std::round(val); }
+template <typename T> T exp2(T val) { return std::exp2(val); }
+template <typename T> T log2(T val) { return std::log2(val); }
+#endif /* (__ANDROID__ || BARE_METAL) */
+
+// Symmetric quantization
+int8_t QSymm8Params::quantize(float value) const
+{
+  const float transformed = value / scale;
+  return static_cast<int8_t>(round(std::max(-128.0f, std::min(127.0f, transformed))));
+}
+
+float QSymm8Params::dequantize(const int8_t value) const
+{
+  return scale * (static_cast<float>(value));
+}
+
+QSymm8RescaleParams QSymm8RescaleParams::make_rescale_params(
+  const QSymm8Params& weight_quant,
+  const QSymm8Params& input_quant,
+  const QSymm8Params& output_quant
+)
+{
+  // Based on the gemmlowp approach: https://github.com/google/gemmlowp/blob/master/doc/quantization_example.cc
+  const float rescale = weight_quant.scale * input_quant.scale / output_quant.scale;
+  const float shiftf = round(log2(0.5f / rescale));
+  const float multf = exp2(31.0f + shiftf)*rescale;
+
+  int64_t shift = static_cast<int64_t>(shiftf);
+  int64_t mult = static_cast<int64_t>(multf);
+
+  if (mult == (1ll << 31))
+  {
+    mult /= 2;
+    shift--;
+  }
+
+  assert(shift >= 0);
+  assert(mult <= std::numeric_limits<int32_t>::max());
+
+  return QSymm8RescaleParams(
+    static_cast<int32_t>(shift),
+    static_cast<int32_t>(mult),
+    rescale
+  );
+}
+
+QSymm8RescaleParams::QSymm8RescaleParams(int32_t shift, int32_t multi, float rescale)
+  : shift(shift), multiplier(multi), rescale(rescale)
+{
+}
+
+// Symmetric per-channel quantization
+int8_t QSymm8PerChannelParams::quantize(float value, float scale) const
+{
+  const float transformed = value / scale;
+  return static_cast<int8_t>(round(std::max(-128.0f, std::min(127.0f, transformed))));
+}
+
+float QSymm8PerChannelParams::dequantize(const int8_t value, float scale) const
+{
+  return scale * (static_cast<float>(value));
+}
+
+QSymm8PerChannelRescaleParams QSymm8PerChannelRescaleParams::make_rescale_params(
+  const QSymm8PerChannelParams& weight_quant,
+  const QSymm8PerChannelParams& input_quant,
+  const QSymm8PerChannelParams& output_quant
+)
+{
+  std::vector<int32_t> shifts;
+  std::vector<int32_t> mults;
+  std::vector<float> rescales;
+
+  for(size_t s = 0; s < input_quant.scales.size(); s++)
+  {
+    // Based on the gemmlowp approach: https://github.com/google/gemmlowp/blob/master/doc/quantization_example.cc
+    const float rescale = weight_quant.scales[s] * input_quant.scales[s] / output_quant.scales[s];
+    const float shiftf = round(log2(0.5f / rescale));
+    const float multf = exp2(31.0f + shiftf)*rescale;
+
+    int64_t shift = static_cast<int64_t>(shiftf);
+    int64_t mult = static_cast<int64_t>(multf);
+
+    if (mult == (1ll << 31))
+    {
+      mult /= 2;
+      shift--;
+    }
+
+    assert(shift >= 0);
+    assert(mult <= std::numeric_limits<int32_t>::max());
+
+    shifts.push_back(static_cast<int32_t>(shift));
+    mults.push_back(static_cast<int32_t>(mult));
+    rescales.push_back(rescale);
+  }
+
+  return QSymm8PerChannelRescaleParams(shifts, mults, rescales);
+}
+
+QSymm8PerChannelRescaleParams QSymm8PerChannelRescaleParams::make_rescale_params(
+  const QSymm8PerChannelParams& weight_quant,
+  const qasymm8::QAsymm8Params& input_quant,
+  const qasymm8::QAsymm8Params& output_quant
+)
+{
+  std::vector<int32_t> shifts;
+  std::vector<int32_t> mults;
+  std::vector<float> rescales;
+
+  for(size_t s = 0; s < weight_quant.scales.size(); s++)
+  {
+    // Based on the gemmlowp approach: https://github.com/google/gemmlowp/blob/master/doc/quantization_example.cc
+    const float rescale = weight_quant.scales[s] * input_quant.scale / output_quant.scale;
+    const float shiftf = round(log2(0.5f / rescale));
+    const float multf = exp2(31.0f + shiftf)*rescale;
+
+    int64_t shift = static_cast<int64_t>(shiftf);
+    int64_t mult = static_cast<int64_t>(multf);
+
+    if (mult == (1ll << 31))
+    {
+      mult /= 2;
+      shift--;
+    }
+
+    assert(shift >= 0);
+    assert(mult <= std::numeric_limits<int32_t>::max());
+
+    shifts.push_back(static_cast<int32_t>(shift));
+    mults.push_back(static_cast<int32_t>(mult));
+    rescales.push_back(rescale);
+  }
+
+  return QSymm8PerChannelRescaleParams(shifts, mults, rescales);
+}
+
+QSymm8PerChannelRescaleParams::QSymm8PerChannelRescaleParams(std::vector<int32_t>& shifts, std::vector<int32_t>& multipliers, std::vector<float>& rescales)
+  : shifts(shifts), multipliers(multipliers), rescales(rescales)
+{
+}
+
+
+} // namespace qsymm8
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp
new file mode 100644
index 0000000000..88d8e9f112
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#include "impl_qa8_qs8_per_channel.hpp" + +namespace depthwise { +template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 1, 1>; +template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 2, 2>; +template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 1, 1>; +template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 2, 2>; +} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp index b102a24250..22231cf019 100644 --- a/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp +++ b/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp @@ -292,6 +292,7 @@ MEMBERFN(void)::run( // Parallelise over blocks of channels const auto start_channel = CHANNEL_BLOCK * start; const auto stop_channel = std::min(_n_channels, CHANNEL_BLOCK * stop); + const auto params_size_per_channel = this->get_packed_params_size()/_n_channels; // Compute top and bottom padding for input and output const int input_pad_top = _padding_top; @@ -325,7 +326,7 @@ MEMBERFN(void)::run( // Get the offset into the packed parameters const auto params_ptr = static_cast(_packed_parameters) + - start_channel*(sizeof(TIn)*KernelRows*KernelColumns + sizeof(TBias)); + start_channel*params_size_per_channel; // Process the row process_tile_row( diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp index e8f44b6bfd..81eb7b306c 100644 --- a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp +++ b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp @@ -36,54 +36,6 @@ #include "impl_base.hpp" #include "depthwise_quantized.hpp" -#pragma once - -using namespace neon_convolution_kernels; -using namespace qasymm8; - -template -inline T saturating_doubling_high_mul(const T&, const int32_t&); - -template <> -inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32_t& b) -{ - return vqrdmulhq_n_s32(a, b); -} - -template <> -inline int32_t saturating_doubling_high_mul(const int32_t& a, const int32_t& b) -{ - return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0); -} - -template -inline T rounding_divide_by_exp2(const T& x, const int exponent); - -template <> -inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent) -{ - const int32x4_t shift = vdupq_n_s32(-exponent); - const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31); - const int32x4_t fixed = vqaddq_s32(x, fixup); - return vrshlq_s32(fixed, shift); -} - -template <> -inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent) -{ - const int32x2_t shift = vdup_n_s32(-exponent); - const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31); - const int32x2_t fixed = vqadd_s32(x, fixup); - return vrshl_s32(fixed, shift); -} - -template <> -inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent) -{ - const int32x2_t xs = vdup_n_s32(x); - return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0); -} - namespace depthwise { template < diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp new file mode 100644 index 0000000000..b27430c242 --- /dev/null +++ b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp @@ -0,0 +1,457 @@ +/* + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + * + * NOTE: Header to be included by implementation files only. + * + * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + */ + +#include + +#include "arm.hpp" +#include "impl_base.hpp" +#include "depthwise_quantized.hpp" + +#pragma once + +namespace { + +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols, + typename FInput, typename FOutput +> +static inline void tilefn_hybrid( + int n_channels, + const void* packed_params, + FInput &get_input_ptr, + FOutput &get_output_ptr, + int32_t clamp_min, + int32_t clamp_max, + uint8_t input_offset, + uint8_t output_offset +) +{ + constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows; + constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols; + + // Offset into channels + int channel = 0; + + // Byte type pointer to weights and biases + const int8_t *wbptr = static_cast(packed_params); + + for (; n_channels >= 8; n_channels -= 8, channel += 8) + { + const int32x4_t biases[2] = { + vld1q_s32(reinterpret_cast(wbptr)), + vld1q_s32(reinterpret_cast(wbptr) + 4), + }; + const int32x4_t multipliers[2] = { + vld1q_s32(reinterpret_cast(wbptr) + 8), + vld1q_s32(reinterpret_cast(wbptr) + 12), + }; + const int32x4_t shifts[2] = { + vld1q_s32(reinterpret_cast(wbptr) + 16), + vld1q_s32(reinterpret_cast(wbptr) + 20), + }; + wbptr += 24*sizeof(int32_t); + + int16x8_t weights[KernelRows][KernelCols]; + for (unsigned int i = 0; i < KernelRows; i++) + { + for (unsigned int j = 0; j < KernelCols; j++) + { + const auto w = vld1_s8(wbptr); + weights[i][j] = reinterpret_cast(vmovl_s8(w)); + wbptr += 8; + } + } + + int16x8_t inputs[InnerTileRows][InnerTileCols]; + const uint8x8_t ioffset = vdup_n_u8(input_offset); + for (unsigned int i = 0; i < InnerTileRows; i++) + { + for (unsigned int j = 0; j < InnerTileCols; j++) + { + const auto x = vld1_u8(get_input_ptr(i, j, channel)); + inputs[i][j] = reinterpret_cast(vsubl_u8(x, ioffset)); + } + } + + for (unsigned int oi = 0; oi < OutputTileRows; oi++) + { + for (unsigned int oj = 0; oj < OutputTileCols; oj++) + { + int32x4_t accs[2]; + for (unsigned int i = 0; i < 2; i++) + { + accs[i] = biases[i]; + } + + for (unsigned int wi = 0; wi 
< KernelRows; wi++) + { + for (unsigned int wj = 0; wj < KernelCols; wj++) + { + const auto w = weights[wi][wj]; + const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj]; + accs[0] = vmlal_s16(accs[0], vget_low_s16(w), vget_low_s16(x)); + accs[1] = vmlal_s16(accs[1], vget_high_s16(w), vget_high_s16(x)); + } + } + + int32x4_t final_accs[2]; + for (unsigned int i = 0; i < 2; i++) + { + const int32x4_t y = rounding_divide_by_exp2( + saturating_doubling_high_mul(accs[i], multipliers[i]), + shifts[i]); + const int32x4_t offset = reinterpret_cast(vdupq_n_u32(output_offset)); + final_accs[i] = vaddq_s32(y, offset); + final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min)); + final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max)); + } + + const auto elems_s16 = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]), + vreinterpretq_s16_s32(final_accs[1])); + const int8x16_t elems = vreinterpretq_s8_s16(elems_s16.val[0]); + const uint8x8_t output = + vget_low_u8(vreinterpretq_u8_s8(vuzpq_s8(elems, elems).val[0])); + + vst1_u8(get_output_ptr(oi, oj, channel), output); + } + } + } + + for (; n_channels; n_channels--, channel++) + { + // Load bias + const int32_t bias = *reinterpret_cast(wbptr); + const int32_t multiplier = *reinterpret_cast(wbptr + sizeof(int32_t)); + const int32_t shift = *reinterpret_cast(wbptr + 2*sizeof(int32_t)); + + wbptr += 3*sizeof(int32_t); + + // Load weights + int16_t weights[KernelRows][KernelCols]; + for (unsigned int i = 0; i < KernelRows; i++) + { + for (unsigned int j = 0; j < KernelCols; j++) + { + weights[i][j] = *(wbptr++); + } + } + + // Load the input activations + int16_t inputs[InnerTileRows][InnerTileCols]; + for (unsigned int i = 0; i < InnerTileRows; i++) + { + for (unsigned int j = 0; j < InnerTileCols; j++) + { + inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset; + } + } + + // Perform the convolution + for (unsigned int oi = 0; oi < OutputTileRows; oi++) + { + for (unsigned int oj = 0; oj < OutputTileCols; oj++) + { + int32_t acc = bias; + + for (unsigned int wi = 0; wi < KernelRows; wi++) + { + for (unsigned int wj = 0; wj < KernelCols; wj++) + { + const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj]; + acc += w * x; + } + } + + // Requantize + acc = rounding_divide_by_exp2( + saturating_doubling_high_mul(acc, multiplier), + -shift); + acc += output_offset; + acc = std::max(acc, clamp_min); + acc = std::min(acc, clamp_max); + uint8_t output = static_cast(acc); + *(get_output_ptr(oi, oj, channel)) = output; + } + } + } +} + +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols, + typename FInput, typename FOutput +> +static inline void execute_tilefn_hybrid( + int n_channels, + const void* packed_params, + const ActivationFunction actfn, + const qasymm8::QAsymm8Params &input_quant, + const qasymm8::QAsymm8Params &output_quant, + FInput &get_input_ptr, + FOutput &get_output_ptr) { + + // Compute min/max clamp values + int32_t clamp_min = std::numeric_limits::min(); + int32_t clamp_max = std::numeric_limits::max(); + + if (actfn == ActivationFunction::ReLU) { + clamp_min = output_quant.offset; + } + + // Disabling Relu6 for now + if (actfn == ActivationFunction::ReLU6) { + const int32_t top_rail = output_quant.quantize(6.0f); + clamp_max = std::min(clamp_max, top_rail); + } + + // Call the tile execution method + tilefn_hybrid(n_channels, packed_params, get_input_ptr, 
get_output_ptr, clamp_min, clamp_max, input_quant.offset, output_quant.offset); +} +} + + + +namespace depthwise { +using namespace qsymm8; +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +QSymm8HybridPerChannelDepthwiseConvolution< + OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols +>::QSymm8HybridPerChannelDepthwiseConvolution( + int n_batches, int n_input_rows, int n_input_cols, int n_channels, + const ActivationFunction activation, + const QSymm8PerChannelParams& weight_quantisation, + const qasymm8::QAsymm8Params& input_quantisation, + const qasymm8::QAsymm8Params& output_quantisation, + unsigned int padding_top, + unsigned int padding_left, + unsigned int padding_bottom, + unsigned int padding_right +) : QSymm8HybridPerChannelDepthwiseConvolution( + n_batches, n_input_rows, n_input_cols, n_channels, + activation, weight_quantisation, input_quantisation, output_quantisation, + QSymm8PerChannelRescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation), + padding_top, padding_left, padding_bottom, padding_right + ) +{ +} + +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +QSymm8HybridPerChannelDepthwiseConvolution< + OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols +>::QSymm8HybridPerChannelDepthwiseConvolution( + int n_batches, int n_input_rows, int n_input_cols, int n_channels, + const ActivationFunction activation, + const QSymm8PerChannelParams& weight_quantisation, + const qasymm8::QAsymm8Params& input_quantisation, + const qasymm8::QAsymm8Params& output_quantisation, + const QSymm8PerChannelRescaleParams& rescale_params, + unsigned int padding_top, + unsigned int padding_left, + unsigned int padding_bottom, + unsigned int padding_right +) : Base( + n_batches, n_input_rows, n_input_cols, n_channels, activation, + padding_top, padding_left, padding_bottom, padding_right + ), + _weights_quant(weight_quantisation), + _input_quant(input_quantisation), + _output_quant(output_quantisation), + _rescale_parameters(rescale_params) +{ +} + +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +uint8_t QSymm8HybridPerChannelDepthwiseConvolution< + OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols +>::_input_padding_value(void) const +{ + return _input_quant.offset; +} + +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +void QSymm8HybridPerChannelDepthwiseConvolution< + OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols +>::_pack_params( + void * const buffer, + const void * const weights, + const unsigned int weight_row_stride, + const unsigned int weight_col_stride, + const void * const biases +) const +{ + const int8_t *wptr = static_cast(weights); + const int32_t *bptr = static_cast(biases); + const int32_t *mptr = static_cast(_rescale_parameters.multipliers.data()); + const int32_t *sptr = static_cast(_rescale_parameters.shifts.data()); + int8_t *outptr = static_cast(buffer); + + // We set the vector length to use doubles on both 
Aarch64 and Aarch32. NOTE + // For SVE set this to half the vector length. + unsigned int veclen = 8; + + // While there are channels left to process, pack a vector length of them at + // a time and reduce the size of vector used as the size of the tensor + // decreases. + for ( + unsigned int n_channels = this->n_channels(); n_channels; + n_channels -= veclen, + outptr += veclen*(3*sizeof(int32_t) + this->kernel_rows*this->kernel_cols) + ) + { + // NOTE Ignore this section if using SVE, the vector length remains the + // same and we just don't fill a full register for the tail. + while (n_channels < veclen) + { + // Reduce the vector length to either 8 or 1 (scalar) + // TODO Support more vector lengths in `execute_tile`. + veclen = (veclen == 16) ? 8 : 1; + } + + // Get pointers to bias and weight portions of the output structure. + int32_t *out_bptr = reinterpret_cast(outptr); + int32_t *out_mptr = reinterpret_cast(outptr + veclen*sizeof(int32_t)); + int32_t *out_sptr = reinterpret_cast(outptr + 2*veclen*sizeof(int32_t)); + int8_t *out_wptr = outptr + 3*veclen*sizeof(int32_t); + + // Copy a vector length of elements + for (unsigned int n = 0; n < veclen && n < n_channels; n++) + { + const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0; + const int32_t multiplier = (mptr != nullptr) ? *(mptr++) : 0; + const int32_t shift = (sptr != nullptr) ? *(sptr++) : 0; + + out_bptr[n] = bias; + out_mptr[n] = multiplier; + out_sptr[n] = -shift; + + for (unsigned int i = 0; i < KernelRows; i++) + { + int8_t *row_outptr = out_wptr + i*KernelCols*veclen; + for (unsigned int j = 0; j < KernelCols; j++) + { + int8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride); + row_outptr[j*veclen + n] = w; + } + } + wptr++; + } + } +} + + +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +template +void QSymm8HybridPerChannelDepthwiseConvolution< + OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols +>::execute_tile( + int n_channels, + const void* packed_params, + const uint8_t* inptr, + unsigned int in_row_stride, + unsigned int in_col_stride, + uint8_t* outptr, + unsigned int out_row_stride, + unsigned int out_col_stride +) { + + // Construct methods to get pointers + const auto get_input_ptr = [inptr, in_row_stride, in_col_stride]( + const int i, const int j, const int channel) { + return inptr + i * in_row_stride + j * in_col_stride + channel; + }; + + const auto get_output_ptr = [outptr, out_row_stride, out_col_stride]( + const int i, const int j, const int channel) { + return outptr + i * out_row_stride + j * out_col_stride + channel; + }; + + execute_tilefn_hybrid( + n_channels, packed_params, Activation, _input_quant, _output_quant, get_input_ptr, get_output_ptr); +} + +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +template +void QSymm8HybridPerChannelDepthwiseConvolution< + OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols +>::execute_tile( + int n_channels, + const void* packed_params, + const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols], + uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols] +) { + // Construct methods to get pointers + const auto get_input_ptr = [inptrs](const int i, const int j, + const int channel) { + return inptrs[i][j] + channel; + 
}; + + const auto get_output_ptr = [outptrs](const int i, const int j, + const int channel) { + return outptrs[i][j] + channel; + }; + + // Call the tile execution method + execute_tilefn_hybrid( + n_channels, packed_params, Activation, _input_quant, _output_quant, get_input_ptr, get_output_ptr); +} + +} // namespace depthwise diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp index 6cf7b97e66..5e47dd56ae 100644 --- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp @@ -40,7 +40,10 @@ Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + if(!is_data_type_quantized_per_channel(weights->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + } ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1); const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); @@ -55,7 +58,7 @@ Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx)); } - const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); + const bool is_quantized = (!is_data_type_quantized_per_channel(weights->data_type())) && is_data_type_quantized_asymmetric(input->data_type()); if(is_quantized) { @@ -67,7 +70,6 @@ Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo ARM_COMPUTE_UNUSED(multiplier); ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f); } - if(!NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation)) { TensorInfo accumulator = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); @@ -88,7 +90,6 @@ Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo { ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info)); } - return Status{}; } } // namespace diff --git a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp index 92ad93e4a7..c564e22d46 100644 --- a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp +++ b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp @@ -84,6 +84,48 @@ std::unique_ptr get_qasymm8_convolver(int kern } } +std::unique_ptr get_qsymm8_perchannel_convolver(int kernel_size, int stride_x, + int n_batches, int in_rows, int in_cols, int n_channels, + neon_convolution_kernels::ActivationFunction activation, + const qsymm8::QSymm8PerChannelParams &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo, + const qsymm8::QSymm8PerChannelRescaleParams &rescale_params, + int padding_top, int padding_left, int padding_bottom, int padding_right) +{ + switch(kernel_size) + { + case 3: + { + switch(stride_x) + { + case 1: + return arm_compute::support::cpp14::make_unique>( + n_batches, in_rows, in_cols, n_channels, activation, wqinfo, 
iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return arm_compute::support::cpp14::make_unique>( + n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + case 5: + { + switch(stride_x) + { + case 1: + return arm_compute::support::cpp14::make_unique>( + n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + case 2: + return arm_compute::support::cpp14::make_unique>( + n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + default: + return nullptr; + } + } + default: + return nullptr; + } +} + #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC std::unique_ptr get_fp16_convolver(int kernel_size, int stride_x, int n_batches, int in_rows, int in_cols, int n_channels, @@ -187,6 +229,9 @@ std::unique_ptr create_convolver(const ITensor const int padding_bottom = conv_info.pad_bottom(); const int padding_right = conv_info.pad_right(); + const bool is_uniform_quantized = (data_type == DataType::QASYMM8) && (weights->info()->data_type() == DataType::QASYMM8); + const bool is_perchannel_quantized = (data_type == DataType::QASYMM8) && (weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL); + const unsigned int stride_x = conv_info.stride().first; const unsigned int kernel_size = weights->info()->tensor_shape().y(); @@ -202,7 +247,7 @@ std::unique_ptr create_convolver(const ITensor } // Create quantized convolver - if(data_type == DataType::QASYMM8) + if(is_uniform_quantized) { const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform(); const UniformQuantizationInfo weights_qinfo = weights->info()->quantization_info().uniform(); @@ -226,6 +271,40 @@ std::unique_ptr create_convolver(const ITensor return get_qasymm8_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); } + else if(is_perchannel_quantized) + { + const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform(); + const QuantizationInfo weights_qinfo = weights->info()->quantization_info(); + const UniformQuantizationInfo output_qinfo = output->info()->quantization_info().uniform(); + + // Check that quantization info are in the range [0, 255] + ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255); + ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255); + const qasymm8::QAsymm8Params iqinfo{ static_cast(input_qinfo.offset), input_qinfo.scale }; + const qsymm8::QSymm8PerChannelParams wqinfo{ weights_qinfo.scale() }; + const qasymm8::QAsymm8Params oqinfo{ static_cast(output_qinfo.offset), output_qinfo.scale }; + + // Calculate rescale parameters + std::vector fmultipliers; + std::vector qmultipliers; + std::vector qshifts; + + for(auto const s : wqinfo.scales) + { + const float fmultipler = iqinfo.scale * s / oqinfo.scale; + int qmultiplier = 0; + int qshift = 0; + quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift); + fmultipliers.push_back(fmultipler); + qmultipliers.push_back(qmultiplier); + qshifts.push_back(qshift); + } + + qsymm8::QSymm8PerChannelRescaleParams 
rescale_params(qshifts, qmultipliers, fmultipliers); + + return get_qsymm8_perchannel_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, activation, + wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); + } else { // Create float convolver @@ -328,7 +407,10 @@ Status NEDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + if(weights->data_type() != DataType::QSYMM8_PER_CHANNEL) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); // Validate convolver @@ -378,7 +460,7 @@ bool NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITenso // Check data type const DataType data_type = weights->data_type(); - bool is_data_type_valid = is_data_type_float(data_type) || is_data_type_quantized_asymmetric(data_type); + bool is_data_type_valid = is_data_type_float(data_type) || is_data_type_quantized_asymmetric(data_type) || data_type == DataType::QSYMM8_PER_CHANNEL; // Check weighs size std::set supported_kernel_sizes = { 3, 5 }; @@ -402,7 +484,12 @@ bool NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITenso bool is_valid_padding = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0); bool supported_padding = is_same_padding || is_valid_padding; // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported - bool is_dilation_supported = (dilation == Size2D(1U, 1U)) || ((dilation.x() == dilation.y()) && strides.first == 1); + bool is_dilation_supported = ((dilation == Size2D(1U, 1U)) || ((dilation.x() == dilation.y()) && strides.first == 1)); + + if(data_type == DataType::QSYMM8_PER_CHANNEL) + { + is_dilation_supported = is_dilation_supported && (dilation == Size2D(1U, 1U)); + } return is_data_type_valid && weights_supported && supported_strides && supported_padding && (depth_multiplier == 1) && is_dilation_supported; } diff --git a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp index 37d2373d7b..6d8c083c3f 100644 --- a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp +++ b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp @@ -680,6 +680,33 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedSymmetricPe } TEST_SUITE_END() // Dilation TEST_SUITE_END() // Generic + +TEST_SUITE(Optimized) +FIXTURE_DATA_TEST_CASE(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(), + framework::dataset::make("DepthMultiplier", 1)), + framework::dataset::make("InputDataType", DataType::QASYMM8)), + framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) })), + framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + ActivationFunctionsDataset)) +{ + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunLarge3x3, 
NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::NIGHTLY, + combine(combine(combine(combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(), + framework::dataset::make("DepthMultiplier", 1)), + framework::dataset::make("InputDataType", DataType::QASYMM8)), + framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) })), + framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + ActivationFunctionsDataset)) +{ + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE_END() // Optimized TEST_SUITE_END() // QSYMM8_PER_CHANNEL TEST_SUITE_END() // Quantized -- cgit v1.2.1
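Note: both QSymm8PerChannelRescaleParams::make_rescale_params() and the assembly dispatch (via quantization::calculate_quantized_multiplier_less_than_one) reduce each per-channel float rescale factor to an integer (multiplier, shift) pair that the kernels apply at run time. The standalone sketch below mirrors that derivation; the scales used are made up for illustration and are not taken from the patch or its tests.

    // Sketch of how a per-channel (multiplier, shift) pair is derived from the
    // float scales, mirroring QSymm8PerChannelRescaleParams::make_rescale_params().
    // Example scales are illustrative only.
    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main()
    {
        const float input_scale  = 0.3f;                        // QASYMM8 input
        const float output_scale = 0.5f;                        // QASYMM8 output
        const std::vector<float> weight_scales = {0.02f, 0.07f}; // QSYMM8_PER_CHANNEL weights

        for (float w : weight_scales)
        {
            const float rescale = w * input_scale / output_scale; // one value per channel
            const float shiftf  = std::round(std::log2(0.5f / rescale));
            const float multf   = std::exp2(31.0f + shiftf) * rescale;

            int64_t shift = static_cast<int64_t>(shiftf);
            int64_t mult  = static_cast<int64_t>(multf);
            if (mult == (1ll << 31)) { mult /= 2; shift--; }     // keep multiplier below 2^31

            assert(shift >= 0 && mult <= INT32_MAX);
            // At run time each channel's accumulator is multiplied by `mult`
            // (saturating doubling high multiply) and rounding-shifted right by `shift`.
            std::printf("rescale=%f multiplier=%lld shift=%lld\n",
                        rescale, (long long)mult, (long long)shift);
        }
    }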