From 30271c779c36a2abe6995c4454674d92bbc1f91f Mon Sep 17 00:00:00 2001
From: Georgios Pinitas <georgios.pinitas@arm.com>
Date: Mon, 24 Jun 2019 14:56:34 +0100
Subject: COMPMID-2156: Optimized dilated convolution for NEON.

Change-Id: I3a8abe8cc9637c8983d9bd69dcbaee1a15eac8d0
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1492
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez <pablo.tello@arm.com>
---
 .../NEON/kernels/convolution/common/padding.cpp    |   4 +-
 .../NEON/kernels/convolution/common/qasymm8.cpp    |   2 +-
 .../convolution/depthwise/depthwise_dilated.cpp    |  32 +
 .../depthwise/depthwise_dilated_qa8_qa8.cpp        | 142 +++
 .../depthwise/depthwise_pack_parameters.cpp        |   2 +-
 .../convolution/depthwise/impl_fp16_fp16.hpp       |  29 +-
 .../convolution/depthwise/impl_fp32_fp32.hpp       |  28 +-
 .../kernels/convolution/depthwise/impl_qa8_qa8.hpp | 997 ++++++++++++++++----
 8 files changed, 996 insertions(+), 240 deletions(-)
 create mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp
 create mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp

diff --git a/src/core/NEON/kernels/convolution/common/padding.cpp b/src/core/NEON/kernels/convolution/common/padding.cpp
index b50067b4e0..88b37b8a83 100644
--- a/src/core/NEON/kernels/convolution/common/padding.cpp
+++ b/src/core/NEON/kernels/convolution/common/padding.cpp
@@ -24,8 +24,8 @@
 #include <cstring>
 #include <cstdint>
 
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/padding.hpp"
+#include "arm.hpp"
+#include "padding.hpp"
 
 namespace padding
 {
diff --git a/src/core/NEON/kernels/convolution/common/qasymm8.cpp b/src/core/NEON/kernels/convolution/common/qasymm8.cpp
index 1de9ebf28a..64e3156bff 100644
--- a/src/core/NEON/kernels/convolution/common/qasymm8.cpp
+++ b/src/core/NEON/kernels/convolution/common/qasymm8.cpp
@@ -28,7 +28,7 @@
 #include <cmath>
 #include <cstdint>
 
-#include "arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp"
+#include "qasymm8.hpp"
 
 namespace qasymm8
 {
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp
new file mode 100644
index 0000000000..3e2bbbb61a
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "impl_dilated.hpp"
+
+template class depthwise::DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>;
+template class depthwise::DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>;
+template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float, float>;
+template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>;
+template class depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>;
+template class depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float, float>;
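The six instantiations above are the float configurations that the dilated wrapper can dispatch to. impl_dilated.hpp itself is not part of this patch, but the decomposition it is assumed to implement explains why no new arithmetic kernels are needed: a depthwise convolution with dilation factor d splits into d*d independent non-dilated convolutions over phase-shifted, subsampled views of the input. The scalar sketch below is our own illustration of that idea (valid padding, stride 1, single channel), not library code:

    // Computes a 3x3 depthwise convolution with dilation d by visiting the
    // d*d output "phases" separately; within one phase every kernel tap lands
    // on the same subsampled grid, so each phase is an ordinary non-dilated
    // convolution that a DepthwiseConvolution instantiation could run.
    #include <cstdio>
    #include <vector>

    int main()
    {
      const int d = 2, K = 3, in_dim = 8;
      const int out_dim = in_dim - d * (K - 1);  // valid padding, stride 1
      std::vector<float> in(in_dim * in_dim), out(out_dim * out_dim, 0.0f);
      const float w[3][3] = {{1, 0, -1}, {2, 0, -2}, {1, 0, -1}};
      for (int i = 0; i < in_dim * in_dim; i++) in[i] = static_cast<float>(i % 7);

      for (int pi = 0; pi < d; pi++)     // phase row
        for (int pj = 0; pj < d; pj++)   // phase column
          for (int i = pi; i < out_dim; i += d)
            for (int j = pj; j < out_dim; j += d)
            {
              float acc = 0.0f;
              for (int ki = 0; ki < K; ki++)
                for (int kj = 0; kj < K; kj++)
                  acc += w[ki][kj] * in[(i + ki * d) * in_dim + (j + kj * d)];
              out[i * out_dim + j] = acc;
            }

      std::printf("out[0][0] = %.1f\n", out[0]);
      return 0;
    }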
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp
new file mode 100644
index 0000000000..879e06158d
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "depthwise_quantized_dilated.hpp"
+#include "impl_dilated.hpp"
+
+namespace depthwise {
+
+template <unsigned int OutputTileRows, unsigned int OutputTileCols,
+          unsigned int KernelRows, unsigned int KernelCols,
+          unsigned int StrideRows, unsigned int StrideCols>
+QAsymm8DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows,
+                                   KernelCols, StrideRows, StrideCols>::
+    QAsymm8DilatedDepthwiseConvolution(
+        int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+        int dilation_factor, nck::ActivationFunction activation,
+        const qasymm8::QAsymm8Params &weight_quantisation,
+        const qasymm8::QAsymm8Params &input_quantisation,
+        const qasymm8::QAsymm8Params &output_quantisation,
+        unsigned int padding_top, unsigned int padding_left,
+        unsigned int padding_bottom, unsigned int padding_right)
+    : QAsymm8DilatedDepthwiseConvolution(
+          n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
+          QAsymm8DilatedDepthwiseConvolution::get_output_size(
+              n_input_rows, padding_top, padding_bottom, dilation_factor),
+          QAsymm8DilatedDepthwiseConvolution::get_output_size(
+              n_input_cols, padding_left, padding_right, dilation_factor),
+          activation, weight_quantisation, input_quantisation,
+          output_quantisation, padding_top, padding_left, padding_bottom,
+          padding_right) {}
+
+template <unsigned int OutputTileRows, unsigned int OutputTileCols,
+          unsigned int KernelRows, unsigned int KernelCols,
+          unsigned int StrideRows, unsigned int StrideCols>
+QAsymm8DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows,
+                                   KernelCols, StrideRows, StrideCols>::
+    QAsymm8DilatedDepthwiseConvolution(
+        int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+        int dilation_factor, int n_output_rows, int n_output_cols,
+        nck::ActivationFunction activation,
+        const qasymm8::QAsymm8Params &weight_quantisation,
+        const qasymm8::QAsymm8Params &input_quantisation,
+        const qasymm8::QAsymm8Params &output_quantisation,
+        unsigned int padding_top, unsigned int padding_left,
+        unsigned int padding_bottom, unsigned int padding_right)
+    : QAsymm8DilatedDepthwiseConvolution(
+          n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
+          n_output_rows, n_output_cols, activation, weight_quantisation,
+          input_quantisation, output_quantisation,
+          qasymm8::QAsymm8RescaleParams::make_rescale_params(
+              weight_quantisation, input_quantisation, output_quantisation),
+          padding_top, padding_left, padding_bottom, padding_right) {}
+
+template <unsigned int OutputTileRows, unsigned int OutputTileCols,
+          unsigned int KernelRows, unsigned int KernelCols,
+          unsigned int StrideRows, unsigned int StrideCols>
+QAsymm8DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows,
+                                   KernelCols, StrideRows, StrideCols>::
+    QAsymm8DilatedDepthwiseConvolution(
+        int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+        int dilation_factor, nck::ActivationFunction activation,
+        const qasymm8::QAsymm8Params &weight_quantisation,
+        const qasymm8::QAsymm8Params &input_quantisation,
+        const qasymm8::QAsymm8Params &output_quantisation,
+        const qasymm8::QAsymm8RescaleParams &rescale_parameters,
+        unsigned int padding_top, unsigned int padding_left,
+        unsigned int padding_bottom, unsigned int padding_right)
+    : QAsymm8DilatedDepthwiseConvolution(
+          n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
+          QAsymm8DilatedDepthwiseConvolution::get_output_size(
+              n_input_rows, padding_top, padding_bottom, dilation_factor),
+          QAsymm8DilatedDepthwiseConvolution::get_output_size(
+              n_input_cols, padding_left, padding_right, dilation_factor),
+          activation, weight_quantisation, input_quantisation,
+          output_quantisation, rescale_parameters, padding_top, padding_left,
+          padding_bottom, padding_right) {}
+
+template <unsigned int OutputTileRows, unsigned int OutputTileCols,
+          unsigned int KernelRows, unsigned int KernelCols,
+          unsigned int StrideRows, unsigned int StrideCols>
+QAsymm8DilatedDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows,
+                                   KernelCols, StrideRows, StrideCols>::
+    QAsymm8DilatedDepthwiseConvolution(
+        int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+        int dilation_factor, int n_output_rows, int n_output_cols,
+        nck::ActivationFunction activation,
+        const qasymm8::QAsymm8Params &weight_quantisation,
+        const qasymm8::QAsymm8Params &input_quantisation,
+        const qasymm8::QAsymm8Params &output_quantisation,
+        const qasymm8::QAsymm8RescaleParams &rescale_parameters,
+        unsigned int padding_top, unsigned int padding_left,
+        unsigned int padding_bottom, unsigned int padding_right)
+    : DilatedDepthwiseConvolution<
+          OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
+          StrideCols, uint8_t, int32_t, uint8_t>(
+          n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor,
+          n_output_rows, n_output_cols, activation, padding_top, padding_left,
+          padding_bottom, padding_right,
+          [weight_quantisation, input_quantisation, output_quantisation,
+           rescale_parameters](
+              const int n_batches, const int n_input_rows,
+              const int n_input_cols, const int n_channels,
+              const int n_output_rows, const int n_output_cols,
+              const nck::ActivationFunction activation,
+              const unsigned int padding_top, const unsigned int padding_left,
+              const unsigned int padding_bottom,
+              const unsigned int padding_right) -> IDepthwiseConvolution * {
+            return new QAsymm8DepthwiseConvolution<
+                OutputTileRows, OutputTileCols, KernelRows, KernelCols,
+                StrideRows, StrideCols>(
+                n_batches, n_input_rows, n_input_cols, n_channels,
+                n_output_rows, n_output_cols, activation, weight_quantisation,
+                input_quantisation, output_quantisation, rescale_parameters,
+                padding_top, padding_left, padding_bottom, padding_right);
+          }) {}
+
+} // namespace depthwise
+
+template class depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1>;
+template class depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2>;
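The delegating constructors above collapse the three QAsymm8Params sets into one QAsymm8RescaleParams via make_rescale_params. The effective requantisation scale of a quantised convolution is (input_scale * weight_scale) / output_scale, and for the fixed-point path this real factor must be expressed as an int32 multiplier plus a right shift, the form consumed by saturating_doubling_high_mul and rounding_divide_by_exp2 in impl_qa8_qa8.hpp further down. The sketch below shows the usual gemmlowp-style derivation; the exact rounding inside make_rescale_params may differ.

    // Derive a Q0.31 fixed-point multiplier and right shift from a real
    // rescale factor in (0, 1). Illustrative only.
    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    static void quantize_multiplier(double real, int32_t &multiplier, int &right_shift)
    {
      assert(real > 0.0 && real < 1.0);
      int exponent = 0;
      const double significand = std::frexp(real, &exponent);  // real = s * 2^e, s in [0.5, 1)
      int64_t q = static_cast<int64_t>(std::round(significand * (1ll << 31)));
      if (q == (1ll << 31)) { q /= 2; exponent++; }  // s rounded up to 1.0
      multiplier = static_cast<int32_t>(q);
      right_shift = -exponent;
    }

    int main()
    {
      const double input_scale = 0.5, weight_scale = 0.5, output_scale = 1.0;
      int32_t m = 0; int shift = 0;
      quantize_multiplier(input_scale * weight_scale / output_scale, m, shift);
      std::printf("multiplier=%ld shift=%d\n", static_cast<long>(m), shift);
      return 0;
    }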
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp
index 692086c74a..f86f1bad73 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp
@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
+#include "impl_base.hpp"
 
 // TODO Move to common utilities somewhere
 template <unsigned int Size> struct DType { };
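The constructor overloads added by the hunks below take explicit n_output_rows/n_output_cols rather than deriving the output shape from the input shape and padding: each inner convolution spawned by the dilated wrapper operates on a subsampled grid, so the wrapper must dictate the output size. A sketch of the arithmetic we assume DilatedDepthwiseConvolution::get_output_size performs (its real signature folds the stride into template parameters; here it is an explicit argument):

    // Output size of a dilated convolution: the kernel's effective span grows
    // to dilation * (kernel - 1) + 1. Illustrative only.
    #include <cstdio>

    static int dilated_output_size(int n_input, int pad_before, int pad_after,
                                   int dilation, int kernel = 3, int stride = 1)
    {
      const int effective_kernel = dilation * (kernel - 1) + 1;
      return (n_input + pad_before + pad_after - effective_kernel) / stride + 1;
    }

    int main()
    {
      // 10 input rows, one padded row on each side, dilation 2: the 3x3
      // kernel spans 5 rows, leaving 8 valid output rows.
      std::printf("%d\n", dilated_output_size(10, 1, 1, 2));
      return 0;
    }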
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
index cbdb19a067..87d2bfd8e6 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
@@ -30,8 +30,8 @@
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 */
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
+#include "arm.hpp"
+#include "impl_base.hpp"
 
 #pragma once
@@ -63,6 +63,31 @@ DepthwiseConvolution<
 {
 }
 
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
+>
+DepthwiseConvolution<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols, StrideRows, StrideCols,
+  float16_t, float16_t, float16_t
+>::DepthwiseConvolution(
+  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+  int n_output_rows, int n_output_cols,
+  ActivationFunction activation,
+  unsigned int padding_top,
+  unsigned int padding_left,
+  unsigned int padding_bottom,
+  unsigned int padding_right
+) : Base(
+  n_batches, n_input_rows, n_input_cols, n_channels,
+  n_output_rows, n_output_cols, activation,
+  padding_top, padding_left, padding_bottom, padding_right
+)
+{
+}
+
 template <
   unsigned int OutputTileRows, unsigned int OutputTileCols,
   unsigned int KernelRows, unsigned int KernelCols,
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
index 264576137c..e19e4c668c 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
@@ -30,8 +30,8 @@
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 */
 
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
+#include "arm.hpp"
+#include "impl_base.hpp"
 
 #pragma once
@@ -63,6 +63,30 @@ DepthwiseConvolution<
 {
 }
 
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
+>
+DepthwiseConvolution<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols, StrideRows, StrideCols,
+  float, float, float
+>::DepthwiseConvolution(
+  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+  int n_output_rows, int n_output_cols,
+  ActivationFunction activation,
+  unsigned int padding_top,
+  unsigned int padding_left,
+  unsigned int padding_bottom,
+  unsigned int padding_right
+) : Base(
+  n_batches, n_input_rows, n_input_cols, n_channels,
+  n_output_rows, n_output_cols, activation,
+  padding_top, padding_left, padding_bottom, padding_right
+)
+{
+}
 template <
   unsigned int OutputTileRows, unsigned int OutputTileCols,
   unsigned int KernelRows, unsigned int KernelCols,
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
index 5546d37e59..bda875dfe1 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
@@ -32,15 +32,38 @@
 
 #include <limits>
 
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp"
+#include "arm.hpp"
+#include "impl_base.hpp"
+#include "depthwise_quantized.hpp"
 
 #pragma once
+
+// Comment the following to use floating-point based quantisation, leave
+// uncommented to use fixed-point.
+#define FIXED_POINT_REQUANTISATION 1
+
 using namespace neon_convolution_kernels;
 using namespace qasymm8;
 
+template <typename T>
+struct clamp_to_limits
+{
+  template <typename U>
+  static inline U clamp(const U& v)
+  {
+    const std::numeric_limits<T> limits;
+    const U min = static_cast<U>(limits.min());
+    const U max = static_cast<U>(limits.max());
+    return std::min(std::max(v, min), max);
+  }
+
+  template <typename U>
+  static inline T clamp_and_cast(const U& v)
+  {
+    return static_cast<T>(clamp(v));
+  }
+};
+
 template <typename T>
 inline T saturating_doubling_high_mul(const T&, const int32_t&);
 
@@ -87,103 +110,214 @@ inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent)
 namespace depthwise
 {
 template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
+    unsigned int OutputTileRows, unsigned int OutputTileCols,
+    unsigned int KernelRows, unsigned int KernelCols,
+    unsigned int StrideRows, unsigned int StrideCols
 >
 QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+    OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
 >::QAsymm8DepthwiseConvolution(
-  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-  const ActivationFunction activation,
-  const QAsymm8Params& weight_quantisation,
-  const QAsymm8Params& input_quantisation,
-  const QAsymm8Params& output_quantisation,
-  unsigned int padding_top,
-  unsigned int padding_left,
-  unsigned int padding_bottom,
-  unsigned int padding_right
-  ) : QAsymm8DepthwiseConvolution(
-  n_batches, n_input_rows, n_input_cols, n_channels,
-  activation, weight_quantisation, input_quantisation, output_quantisation,
-  QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
-  padding_top, padding_left, padding_bottom, padding_right
-)
+    int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+    const ActivationFunction activation,
+    const QAsymm8Params& weight_quantisation,
+    const QAsymm8Params& input_quantisation,
+    const QAsymm8Params& output_quantisation,
+    unsigned int padding_top,
+    unsigned int padding_left,
+    unsigned int padding_bottom,
+    unsigned int padding_right
+) : QAsymm8DepthwiseConvolution(
+    n_batches, n_input_rows, n_input_cols, n_channels,
+    activation, weight_quantisation, input_quantisation, output_quantisation,
+    QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
+    padding_top, padding_left, padding_bottom, padding_right
+  )
 {
 }
 
 template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
+    unsigned int OutputTileRows, unsigned int OutputTileCols,
+    unsigned int KernelRows, unsigned int KernelCols,
+    unsigned int StrideRows, unsigned int StrideCols
 >
 QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
+    OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
 >::QAsymm8DepthwiseConvolution(
-  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-  const ActivationFunction activation,
-  const QAsymm8Params& weight_quantisation,
-  const QAsymm8Params& input_quantisation,
-  const QAsymm8Params& output_quantisation,
-  const QAsymm8RescaleParams& rescale_params,
-  unsigned int padding_top,
-  unsigned int padding_left,
-  unsigned int padding_bottom,
-  unsigned
int padding_right - ) : Base( - n_batches, n_input_rows, n_input_cols, n_channels, activation, - padding_top, padding_left, padding_bottom, padding_right -), - _weights_quant(weight_quantisation), - _inputs_quant(input_quantisation), - _output_quant(output_quantisation), - rescale_parameters(rescale_params) + int n_batches, int n_input_rows, int n_input_cols, int n_channels, + int n_output_rows, int n_output_cols, + const ActivationFunction activation, + const QAsymm8Params& weight_quantisation, + const QAsymm8Params& input_quantisation, + const QAsymm8Params& output_quantisation, + unsigned int padding_top, + unsigned int padding_left, + unsigned int padding_bottom, + unsigned int padding_right +) : QAsymm8DepthwiseConvolution( + n_batches, n_input_rows, n_input_cols, n_channels, + n_output_rows, n_output_cols, + activation, weight_quantisation, input_quantisation, output_quantisation, + QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation), + padding_top, padding_left, padding_bottom, padding_right + ) { } template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +QAsymm8DepthwiseConvolution< + OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols +>::QAsymm8DepthwiseConvolution( + int n_batches, int n_input_rows, int n_input_cols, int n_channels, + const ActivationFunction activation, + const QAsymm8Params& weight_quantisation, + const QAsymm8Params& input_quantisation, + const QAsymm8Params& output_quantisation, + const QAsymm8RescaleParams& rescale_params, + unsigned int padding_top, + unsigned int padding_left, + unsigned int padding_bottom, + unsigned int padding_right +) : Base( + n_batches, n_input_rows, n_input_cols, n_channels, + get_activation_fn(activation, output_quantisation), + padding_top, padding_left, padding_bottom, padding_right + ), + _weights_quant(weight_quantisation), + _inputs_quant(input_quantisation), + _output_quant(output_quantisation), + rescale_parameters(rescale_params) +{ +} + +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +QAsymm8DepthwiseConvolution< + OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols +>::QAsymm8DepthwiseConvolution( + int n_batches, int n_input_rows, int n_input_cols, int n_channels, + int n_output_rows, int n_output_cols, + const ActivationFunction activation, + const QAsymm8Params& weight_quantisation, + const QAsymm8Params& input_quantisation, + const QAsymm8Params& output_quantisation, + const QAsymm8RescaleParams& rescale_params, + unsigned int padding_top, + unsigned int padding_left, + unsigned int padding_bottom, + unsigned int padding_right +) : Base( + n_batches, n_input_rows, n_input_cols, n_channels, + n_output_rows, n_output_cols, + get_activation_fn(activation, output_quantisation), + padding_top, padding_left, padding_bottom, padding_right + ), + _weights_quant(weight_quantisation), + _inputs_quant(input_quantisation), + _output_quant(output_quantisation), + rescale_parameters(rescale_params) +{ +} + +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int 
KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +ActivationFunction QAsymm8DepthwiseConvolution< + OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols +>::get_activation_fn( + const ActivationFunction activation, + const QAsymm8Params& output_quant +) +{ + if ( + (activation == ActivationFunction::ReLU && + output_quant.quantize(0) == 0) || + (activation == ActivationFunction::ReLU6 && + output_quant.quantize(0) == 0 && + output_quant.dequantize(255) <= 6.0f) + ) + { + // If the range of values which can be represented by a quantized value are + // within the range that would be produced by the activation function, then + // the activation function is redundant and can be skipped. + return ActivationFunction::None; + } + else if( + activation == ActivationFunction::ReLU6 && + output_quant.dequantize(255) <= 6.0f + ) + { + // If the largest value that can be represented by a quantized value is + // lower than the upper boundary, then the activation function can be + // relaxed to a ReLU. + return ActivationFunction::ReLU; + } + + return activation; +} + +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols > uint8_t QAsymm8DepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols + OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols >::_input_padding_value(void) const { return _inputs_quant.offset; } template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols > void QAsymm8DepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols + OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols >::_pack_params( - void * const buffer, - const void * const weights, - const unsigned int weight_row_stride, - const unsigned int weight_col_stride, - const void * const biases - ) const + void * const buffer, + const void * const weights, + const unsigned int weight_row_stride, + const unsigned int weight_col_stride, + const void * const biases +) const { const uint8_t *wptr = static_cast(weights); const int32_t *bptr = static_cast(biases); uint8_t *outptr = static_cast(buffer); - // We set the vector length to use doubles on both Aarch64 and Aarch32. NOTE - // For SVE set this to half the vector length. + // We set the vector length to use quad registers on Aarch64 and only doubles + // on Aarch32. NOTE For SVE set this to the actual vector length. +#if defined(__aarch64__) + unsigned int veclen = 16; +#else +#if defined(__arm__) unsigned int veclen = 8; +#endif +#endif + + // Compute the rank 0 offset arising from the quantisation parameters. + const int32_t rank0_offset = (KernelRows * KernelCols * + static_cast(_weights_quant.offset) * + static_cast(_inputs_quant.offset)); // While there are channels left to process, pack a vector length of them at // a time and reduce the size of vector used as the size of the tensor // decreases. 
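Why the packed bias absorbs rank0_offset and rank1_offset: expanding the quantised product sum over the K kernel taps gives sum((w - z_w) * (x - z_x)) = sum(w*x) - z_x*sum(w) - z_w*sum(x) + K*z_w*z_x. The term K*z_w*z_x (rank 0) is a constant and z_x*sum(w) (rank 1) depends only on the weights of one channel, so both fold into the bias at pack time; the remaining z_w*sum(x) depends on the input tile and is subtracted per tile in execute_tile below. A scalar check of the identity with made-up numbers:

    // Verifies: sum((w-zw)*(x-zx)) == sum(w*x) + [K*zw*zx - zx*sum(w)] - zw*sum(x),
    // i.e. the packed-bias folding performed by _pack_params.
    #include <cstdint>
    #include <cstdio>

    int main()
    {
      const int32_t zw = 5, zx = 3, K = 9;  // offsets and 3x3 kernel size
      int32_t w[9], x[9];
      for (int i = 0; i < 9; i++) { w[i] = i; x[i] = 2 * i + 1; }

      int32_t reference = 0, wsum = 0, xsum = 0, wx = 0;
      for (int i = 0; i < 9; i++)
      {
        reference += (w[i] - zw) * (x[i] - zx);
        wsum += w[i]; xsum += x[i]; wx += w[i] * x[i];
      }
      const int32_t packed_bias = K * zw * zx - zx * wsum;     // folded at pack time
      const int32_t runtime = wx + packed_bias - zw * xsum;    // per-tile correction
      std::printf("%d == %d\n", static_cast<int>(reference), static_cast<int>(runtime));
      return 0;
    }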
for ( - unsigned int n_channels = this->n_channels(); n_channels; - n_channels -= veclen, - outptr += veclen*(sizeof(int32_t) + this->kernel_rows*this->kernel_cols) - ) + unsigned int n_channels = this->n_channels(); n_channels; + n_channels -= veclen, + outptr += veclen*(sizeof(int32_t) + this->kernel_rows*this->kernel_cols) + ) { // NOTE Ignore this section if using SVE, the vector length remains the // same and we just don't fill a full register for the tail. @@ -201,8 +335,8 @@ void QAsymm8DepthwiseConvolution< // Copy a vector length of elements for (unsigned int n = 0; n < veclen && n < n_channels; n++) { - const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0; - out_bptr[n] = bias; + int32_t bias = (bptr != nullptr) ? *(bptr++) : 0; + uint32_t weight_sum = 0; for (unsigned int i = 0; i < KernelRows; i++) { @@ -211,140 +345,297 @@ void QAsymm8DepthwiseConvolution< { uint8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride); row_outptr[j*veclen + n] = w; + weight_sum += static_cast(w); } } wptr++; + + // Include in the bias contributions from the quantisation offset + int32_t rank1_offset = static_cast( + static_cast(_inputs_quant.offset) * weight_sum + ); + out_bptr[n] = bias + rank0_offset - rank1_offset; } } } template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols, - typename FInput, typename FOutput + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols > -static inline void tilefn( - int n_channels, - const void* packed_params, - FInput &get_input_ptr, - FOutput &get_output_ptr, - const int32_t clamp_max, - const int32_t clamp_min, - const uint8_t input_offset, - const uint8_t weight_offset, - const uint8_t output_offset, - const int32_t requant_multiplier, - const int32_t requant_shift - ) +template +void QAsymm8DepthwiseConvolution< + OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols +>::execute_tile( + int n_channels, + const void* packed_params, + const uint8_t* inptr, + const unsigned int in_row_stride, + const unsigned int in_col_stride, + uint8_t* outptr, + const unsigned int out_row_stride, + const unsigned int out_col_stride +) { - constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows; - constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols; - - // Offset into channels - int channel = 0; + // Activation parameters (unused if Activation is None) + const uint8_t aqmin = _output_quant.offset; + const uint8_t aqmax = (Activation == ActivationFunction::ReLU6) ? 
+ std::min(255u, _output_quant.quantize(6.0f)) : 255u; // Byte type pointer to weights and biases const uint8_t *wbptr = static_cast(packed_params); - for (; n_channels >= 8; n_channels -= 8, channel += 8) +#if defined(__aarch64__) // Under Aarch64 only use quad registers + for (; n_channels >= 16; n_channels -= 16) + { + // Load biases + const int32x4_t biases[4] = { + vld1q_s32(reinterpret_cast(wbptr)), + vld1q_s32(reinterpret_cast(wbptr) + 4), + vld1q_s32(reinterpret_cast(wbptr) + 8), + vld1q_s32(reinterpret_cast(wbptr) + 12) + }; + wbptr += 16*sizeof(int32_t); + + // Load weights + uint8x16_t weights[KernelRows][KernelCols]; + for (unsigned int i = 0; i < KernelRows; i++) + { + for (unsigned int j = 0; j < KernelCols; j++) + { + weights[i][j] = vld1q_u8(wbptr); + wbptr += 16; + } + } + + // Load the input activations + uint8x16_t inputs[Base::inner_tile_rows][Base::inner_tile_cols]; + for (unsigned int i = 0; i < Base::inner_tile_rows; i++) + { + for (unsigned int j = 0; j < Base::inner_tile_cols; j++) + { + inputs[i][j] = vld1q_u8(inptr + i*in_row_stride + j*in_col_stride); + } + } + inptr += 16; + + // Perform the convolution + for (unsigned int oi = 0; oi < OutputTileRows; oi++) + { + for (unsigned int oj = 0; oj < OutputTileCols; oj++) + { + // Two sets of operations are required, we perform the + // multiply-accumulates for the convolution proper but must also sum + // the tile elements to account for the _weight_ offset. + uint32x4_t accs[4]; + for (unsigned int i = 0; i < 4; i++) + { + accs[i] = reinterpret_cast(biases[i]); + } + + for (unsigned int wi = 0; wi < KernelRows; wi++) + { + for (unsigned int wj = 0; wj < KernelCols; wj++) + { + // Get relevant weight and activation pixel + const uint8x16_t w = weights[wi][wj]; + const uint8x16_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj]; + + // Perform multiplication and accumulation + const uint16x8_t muls[2] = { + vmull_u8(vget_low_u8(w), vget_low_u8(x)), + vmull_u8(vget_high_u8(w), vget_high_u8(x)) + }; + + const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset); + const uint16x8_t sum_elems[2] = { + vmull_u8(vget_low_u8(x), woffset), + vmull_u8(vget_high_u8(x), woffset) + }; + + const uint32x4_t tmps[4] = { + vsubl_u16(vget_low_u16(muls[0]), vget_low_u16(sum_elems[0])), + vsubl_u16(vget_high_u16(muls[0]), vget_high_u16(sum_elems[0])), + vsubl_u16(vget_low_u16(muls[1]), vget_low_u16(sum_elems[1])), + vsubl_u16(vget_high_u16(muls[1]), vget_high_u16(sum_elems[1])), + }; + for (unsigned int i = 0; i < 4; i++) + { + accs[i] = vaddq_u32(accs[i], tmps[i]); + } + } + } + + // Rescale the accumulator and add in the new offset. 
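The FIXED_POINT_REQUANTISATION branch below maps each int32 accumulator onto the uint8 output scale as rounding_divide_by_exp2(saturating_doubling_high_mul(acc, multiplier), shift) + output_offset. A scalar model of those two primitives follows; the NEON versions above may treat ties on negative values slightly differently.

    #include <cstdint>
    #include <cstdio>

    // Rounded high half of the doubled 64-bit product: acc * (multiplier / 2^31).
    static int32_t sdh_mul(int32_t a, int32_t b)
    {
      if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;  // lone overflow case
      const int64_t ab_x2 = 2 * static_cast<int64_t>(a) * static_cast<int64_t>(b);
      return static_cast<int32_t>((ab_x2 + (1ll << 31)) >> 32);
    }

    // Divide by 2^exponent, rounding to nearest.
    static int32_t rdiv_exp2(int32_t x, int exponent)
    {
      const int32_t mask = (1 << exponent) - 1;
      const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
      return (x >> exponent) + (((x & mask) > threshold) ? 1 : 0);
    }

    int main()
    {
      // multiplier = 0.25 * 2^31 and shift 1 give an overall scale of 0.125.
      const int32_t acc = 1000, multiplier = 1 << 29, offset = 128;
      const int shift = 1;
      std::printf("%d\n", rdiv_exp2(sdh_mul(acc, multiplier), shift) + offset);  // 253
      return 0;
    }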
+ uint32x4_t final_accs[4]; + for (unsigned int i = 0; i < 4; i++) + { +#ifdef FIXED_POINT_REQUANTISATION + const int32x4_t y = rounding_divide_by_exp2( + saturating_doubling_high_mul( + reinterpret_cast(accs[i]), rescale_parameters.multiplier + ), + rescale_parameters.shift + ); + const int32x4_t offset = reinterpret_cast(vdupq_n_u32(_output_quant.offset)); + final_accs[i] = reinterpret_cast(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0))); +#else // floating point requantisation + float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast(accs[i])); + fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale)); + fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast(_output_quant.offset))); + fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f)); + final_accs[i] = vcvtq_u32_f32(fp_acc); +#endif + } + + uint8x16_t output = vcombine_u8( + vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1]))), + vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[2]), vqmovn_u32(final_accs[3]))) + ); + + // Apply the activation function + if (Activation == ActivationFunction::ReLU || + Activation == ActivationFunction::ReLU6) + { + output = vmaxq_u8(output, vdupq_n_u8(aqmin)); + } + if (Activation == ActivationFunction::ReLU6) + { + output = vminq_u8(output, vdupq_n_u8(aqmax)); + } + + vst1q_u8(outptr + oi*out_row_stride + oj*out_col_stride, output); + } + } + outptr += 16; + } +#endif // defined(__aarch64__) + for (; n_channels >= 8; n_channels -= 8) { const int32x4_t biases[2] = { - vld1q_s32(reinterpret_cast(wbptr)), - vld1q_s32(reinterpret_cast(wbptr) + 4), + vld1q_s32(reinterpret_cast(wbptr)), + vld1q_s32(reinterpret_cast(wbptr) + 4), }; wbptr += 8*sizeof(int32_t); - int16x8_t weights[KernelRows][KernelCols]; - const uint8x8_t woffset = vdup_n_u8(weight_offset); + uint8x8_t weights[KernelRows][KernelCols]; for (unsigned int i = 0; i < KernelRows; i++) { for (unsigned int j = 0; j < KernelCols; j++) { - const uint8x8_t w = vld1_u8(wbptr); - weights[i][j] = reinterpret_cast(vsubl_u8(w, woffset)); + weights[i][j] = vld1_u8(wbptr); wbptr += 8; } } - int16x8_t inputs[InnerTileRows][InnerTileCols]; - const uint8x8_t ioffset = vdup_n_u8(input_offset); - for (unsigned int i = 0; i < InnerTileRows; i++) + uint8x8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols]; + for (unsigned int i = 0; i < Base::inner_tile_rows; i++) { - for (unsigned int j = 0; j < InnerTileCols; j++) + for (unsigned int j = 0; j < Base::inner_tile_cols; j++) { - const auto x = vld1_u8(get_input_ptr(i, j, channel)); - inputs[i][j] = reinterpret_cast(vsubl_u8(x, ioffset)); + inputs[i][j] = vld1_u8(inptr + i*in_row_stride + j*in_col_stride); } } + inptr += 8; for (unsigned int oi = 0; oi < OutputTileRows; oi++) { for (unsigned int oj = 0; oj < OutputTileCols; oj++) { - int32x4_t accs[2]; + uint32x4_t accs[2]; for (unsigned int i = 0; i < 2; i++) { - accs[i] = biases[i]; + accs[i] = reinterpret_cast(biases[i]); } for (unsigned int wi = 0; wi < KernelRows; wi++) { for (unsigned int wj = 0; wj < KernelCols; wj++) { - const auto w = weights[wi][wj]; - const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj]; - accs[0] = vmlal_s16(accs[0], vget_low_s16(w), vget_low_s16(x)); - accs[1] = vmlal_s16(accs[1], vget_high_s16(w), vget_high_s16(x)); + const uint8x8_t w = weights[wi][wj]; + const uint8x8_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj]; + + const uint16x8_t muls = vmull_u8(w, x); + const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset); + const uint16x8_t sum_elems = vmull_u8(x, woffset); + + const uint32x4_t 
tmps[2] = { + vsubl_u16(vget_low_u16(muls), vget_low_u16(sum_elems)), + vsubl_u16(vget_high_u16(muls), vget_high_u16(sum_elems)), + }; + for (unsigned int i = 0; i < 2; i++) + { + accs[i] = vaddq_u32(accs[i], tmps[i]); + } } } - int32x4_t final_accs[2]; + uint32x4_t final_accs[2]; for (unsigned int i = 0; i < 2; i++) { +#ifdef FIXED_POINT_REQUANTISATION const int32x4_t y = rounding_divide_by_exp2( - saturating_doubling_high_mul(accs[i], requant_multiplier), - requant_shift); - const int32x4_t offset = reinterpret_cast(vdupq_n_u32(output_offset)); - final_accs[i] = vaddq_s32(y, offset); - final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min)); - final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max)); + saturating_doubling_high_mul( + reinterpret_cast(accs[i]), rescale_parameters.multiplier + ), + rescale_parameters.shift + ); + const int32x4_t offset = reinterpret_cast(vdupq_n_u32(_output_quant.offset)); + final_accs[i] = reinterpret_cast(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0))); +#else // floating point requantisation + float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast(accs[i])); + fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale)); + fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast(_output_quant.offset))); + fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f)); + final_accs[i] = vcvtq_u32_f32(fp_acc); +#endif } - const auto elems_s16 = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]), - vreinterpretq_s16_s32(final_accs[1])); - const int8x16_t elems = vreinterpretq_s8_s16(elems_s16.val[0]); - const uint8x8_t output = - vget_low_u8(vreinterpretq_u8_s8(vuzpq_s8(elems, elems).val[0])); - vst1_u8(get_output_ptr(oi, oj, channel), output); + uint8x8_t output = vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1]))); + + // Apply the activation function + if (Activation == ActivationFunction::ReLU || + Activation == ActivationFunction::ReLU6) + { + output = vmax_u8(output, vdup_n_u8(aqmin)); + } + if (Activation == ActivationFunction::ReLU6) + { + output = vmin_u8(output, vdup_n_u8(aqmax)); + } + + vst1_u8(outptr + oi*out_row_stride + oj*out_col_stride, output); } } + outptr += 8; } - for (; n_channels; n_channels--, channel++) + for (; n_channels; n_channels--) { // Load bias const int32_t bias = *reinterpret_cast(wbptr); wbptr += sizeof(int32_t); // Load weights - int16_t weights[KernelRows][KernelCols]; + uint8_t weights[KernelRows][KernelCols]; for (unsigned int i = 0; i < KernelRows; i++) { for (unsigned int j = 0; j < KernelCols; j++) { - weights[i][j] = *(wbptr++) - weight_offset; + weights[i][j] = *(wbptr++); } } // Load the input activations - int16_t inputs[InnerTileRows][InnerTileCols]; - for (unsigned int i = 0; i < InnerTileRows; i++) + uint8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols]; + for (unsigned int i = 0; i < Base::inner_tile_rows; i++) { - for (unsigned int j = 0; j < InnerTileCols; j++) + for (unsigned int j = 0; j < Base::inner_tile_cols; j++) { - inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset; + inputs[i][j] = *(inptr + i*in_row_stride + j*in_col_stride); } } + inptr++; // Perform the convolution for (unsigned int oi = 0; oi < OutputTileRows; oi++) @@ -352,135 +643,377 @@ static inline void tilefn( for (unsigned int oj = 0; oj < OutputTileCols; oj++) { int32_t acc = bias; + uint32_t element_sum = 0; for (unsigned int wi = 0; wi < KernelRows; wi++) { for (unsigned int wj = 0; wj < KernelCols; wj++) { const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj]; - 
acc += w * x; + acc += static_cast(static_cast(w) * static_cast(x)); + element_sum += static_cast(x); } } + acc -= static_cast(element_sum) * static_cast(_weights_quant.offset); + // Requantize +#ifdef FIXED_POINT_REQUANTISATION acc = rounding_divide_by_exp2( - saturating_doubling_high_mul(acc, requant_multiplier), - requant_shift); - acc += output_offset; - acc = std::max(acc, clamp_min); - acc = std::min(acc, clamp_max); - uint8_t output = static_cast(acc); - *(get_output_ptr(oi, oj, channel)) = output; + saturating_doubling_high_mul(acc, rescale_parameters.multiplier), + rescale_parameters.shift + ); + acc += _output_quant.offset; + uint8_t output = clamp_to_limits::clamp_and_cast(acc); +#else // floating point requantization + float fp_acc = static_cast(acc); + fp_acc *= rescale_parameters.rescale; + fp_acc += static_cast(_output_quant.offset); + fp_acc = std::max(fp_acc, 0.0f); + uint8_t output = static_cast(std::min(static_cast(fp_acc), 255)); +#endif + + // Apply the activation function + if (Activation == ActivationFunction::ReLU || + Activation == ActivationFunction::ReLU6) + { + output = std::max(output, aqmin); + } + if (Activation == ActivationFunction::ReLU6) + { + output = std::min(output, aqmax); + } + + *(outptr + oi*out_row_stride + oj*out_col_stride) = output; } } + outptr++; } } template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols, - typename FInput, typename FOutput + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols > -static inline void execute_tilefn( - int n_channels, - const void* packed_params, - const nck::ActivationFunction actfn, - FInput &get_input_ptr, - FOutput &get_output_ptr, - const QAsymm8Params &input_quant, - const QAsymm8Params &weight_quant, - const QAsymm8Params &output_quant, - const QAsymm8RescaleParams &requant - ) { - // Compute min/max clamp values - int32_t clamp_min = std::numeric_limits::min(); - int32_t clamp_max = std::numeric_limits::max(); - - if (actfn == nck::ActivationFunction::ReLU || - actfn == nck::ActivationFunction::ReLU6) { - const int32_t bottom_rail = output_quant.offset; - clamp_min = std::max(clamp_min, bottom_rail); +template +void QAsymm8DepthwiseConvolution< + OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols +>::execute_tile( + int n_channels, + const void* packed_params, + const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols], + uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols] +) +{ + // Activation parameters (unused if Activation is None) + const uint8_t aqmin = _output_quant.offset; + const uint8_t aqmax = (Activation == ActivationFunction::ReLU6) ? 
+ std::min(255u, _output_quant.quantize(6.0f)) : 255u; + + // Byte type pointer to weights and biases + const uint8_t *wbptr = static_cast(packed_params); + + // Offset into input/output tensors + int n = 0; + +#if defined(__aarch64__) // Under Aarch64 only use quad registers + for (; n_channels >= 16; n_channels -= 16, n += 16) + { + // Load biases + const int32x4_t biases[4] = { + vld1q_s32(reinterpret_cast(wbptr)), + vld1q_s32(reinterpret_cast(wbptr) + 4), + vld1q_s32(reinterpret_cast(wbptr) + 8), + vld1q_s32(reinterpret_cast(wbptr) + 12) + }; + wbptr += 16*sizeof(int32_t); + + // Load weights + uint8x16_t weights[KernelRows][KernelCols]; + for (unsigned int i = 0; i < KernelRows; i++) + { + for (unsigned int j = 0; j < KernelCols; j++) + { + weights[i][j] = vld1q_u8(wbptr); + wbptr += 16; + } + } + + // Load the input activations + uint8x16_t inputs[Base::inner_tile_rows][Base::inner_tile_cols]; + for (unsigned int i = 0; i < Base::inner_tile_rows; i++) + { + for (unsigned int j = 0; j < Base::inner_tile_cols; j++) + { + inputs[i][j] = vld1q_u8(inptrs[i][j] + n); + } + } + + // Perform the convolution + for (unsigned int oi = 0; oi < OutputTileRows; oi++) + { + for (unsigned int oj = 0; oj < OutputTileCols; oj++) + { + // Two sets of operations are required, we perform the + // multiply-accumulates for the convolution proper but must also sum + // the tile elements to account for the _weight_ offset. + uint32x4_t accs[4]; + for (unsigned int i = 0; i < 4; i++) + { + accs[i] = reinterpret_cast(biases[i]); + } + + for (unsigned int wi = 0; wi < KernelRows; wi++) + { + for (unsigned int wj = 0; wj < KernelCols; wj++) + { + // Get relevant weight and activation pixel + const uint8x16_t w = weights[wi][wj]; + const uint8x16_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj]; + + // Perform multiplication and accumulation + const uint16x8_t muls[2] = { + vmull_u8(vget_low_u8(w), vget_low_u8(x)), + vmull_u8(vget_high_u8(w), vget_high_u8(x)) + }; + + const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset); + const uint16x8_t sum_elems[2] = { + vmull_u8(vget_low_u8(x), woffset), + vmull_u8(vget_high_u8(x), woffset) + }; + + const uint32x4_t tmps[4] = { + vsubl_u16(vget_low_u16(muls[0]), vget_low_u16(sum_elems[0])), + vsubl_u16(vget_high_u16(muls[0]), vget_high_u16(sum_elems[0])), + vsubl_u16(vget_low_u16(muls[1]), vget_low_u16(sum_elems[1])), + vsubl_u16(vget_high_u16(muls[1]), vget_high_u16(sum_elems[1])), + }; + for (unsigned int i = 0; i < 4; i++) + { + accs[i] = vaddq_u32(accs[i], tmps[i]); + } + } + } + + // Rescale the accumulator and add in the new offset. 
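This second execute_tile overload (taking precomputed per-pixel pointer arrays) requantises identically. When FIXED_POINT_REQUANTISATION is not defined, the branch below instead converts the accumulator to float, scales it by rescale_parameters.rescale, adds the output offset and saturates into [0, 255]. A scalar model of that float path:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Float requantisation: rescale = input_scale * weight_scale / output_scale.
    static uint8_t requantize_fp(int32_t acc, float rescale, int32_t output_offset)
    {
      float fp_acc = static_cast<float>(acc);
      fp_acc *= rescale;
      fp_acc += static_cast<float>(output_offset);
      fp_acc = std::max(fp_acc, 0.0f);  // lower saturation
      return static_cast<uint8_t>(std::min(static_cast<int32_t>(fp_acc), 255));
    }

    int main()
    {
      // 1000 * 0.125 + 128 = 253, matching the fixed-point model above.
      std::printf("%d\n", requantize_fp(1000, 0.125f, 128));
      return 0;
    }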
+ uint32x4_t final_accs[4]; + for (unsigned int i = 0; i < 4; i++) + { +#ifdef FIXED_POINT_REQUANTISATION + const int32x4_t y = rounding_divide_by_exp2( + saturating_doubling_high_mul( + reinterpret_cast(accs[i]), rescale_parameters.multiplier + ), + rescale_parameters.shift + ); + const int32x4_t offset = reinterpret_cast(vdupq_n_u32(_output_quant.offset)); + final_accs[i] = reinterpret_cast(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0))); +#else // floating point requantisation + float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast(accs[i])); + fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale)); + fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast(_output_quant.offset))); + fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f)); + final_accs[i] = vcvtq_u32_f32(fp_acc); +#endif + } + + uint8x16_t output = vcombine_u8( + vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1]))), + vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[2]), vqmovn_u32(final_accs[3]))) + ); + + // Apply the activation function + if (Activation == ActivationFunction::ReLU || + Activation == ActivationFunction::ReLU6) + { + output = vmaxq_u8(output, vdupq_n_u8(aqmin)); + } + if (Activation == ActivationFunction::ReLU6) + { + output = vminq_u8(output, vdupq_n_u8(aqmax)); + } + + vst1q_u8(outptrs[oi][oj] + n, output); + } + } } +#endif // defined(__aarch64__) + for (; n_channels >= 8; n_channels -= 8, n += 8) + { + const int32x4_t biases[2] = { + vld1q_s32(reinterpret_cast(wbptr)), + vld1q_s32(reinterpret_cast(wbptr) + 4), + }; + wbptr += 8*sizeof(int32_t); + + uint8x8_t weights[KernelRows][KernelCols]; + for (unsigned int i = 0; i < KernelRows; i++) + { + for (unsigned int j = 0; j < KernelCols; j++) + { + weights[i][j] = vld1_u8(wbptr); + wbptr += 8; + } + } + + uint8x8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols]; + for (unsigned int i = 0; i < Base::inner_tile_rows; i++) + { + for (unsigned int j = 0; j < Base::inner_tile_cols; j++) + { + inputs[i][j] = vld1_u8(inptrs[i][j] + n); + } + } + + for (unsigned int oi = 0; oi < OutputTileRows; oi++) + { + for (unsigned int oj = 0; oj < OutputTileCols; oj++) + { + uint32x4_t accs[2]; + for (unsigned int i = 0; i < 2; i++) + { + accs[i] = reinterpret_cast(biases[i]); + } + + for (unsigned int wi = 0; wi < KernelRows; wi++) + { + for (unsigned int wj = 0; wj < KernelCols; wj++) + { + const uint8x8_t w = weights[wi][wj]; + const uint8x8_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj]; + + const uint16x8_t muls = vmull_u8(w, x); + const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset); + const uint16x8_t sum_elems = vmull_u8(x, woffset); + + const uint32x4_t tmps[2] = { + vsubl_u16(vget_low_u16(muls), vget_low_u16(sum_elems)), + vsubl_u16(vget_high_u16(muls), vget_high_u16(sum_elems)), + }; + for (unsigned int i = 0; i < 2; i++) + { + accs[i] = vaddq_u32(accs[i], tmps[i]); + } + } + } + + uint32x4_t final_accs[2]; + for (unsigned int i = 0; i < 2; i++) + { +#ifdef FIXED_POINT_REQUANTISATION + const int32x4_t y = rounding_divide_by_exp2( + saturating_doubling_high_mul( + reinterpret_cast(accs[i]), rescale_parameters.multiplier + ), + rescale_parameters.shift + ); + const int32x4_t offset = reinterpret_cast(vdupq_n_u32(_output_quant.offset)); + final_accs[i] = reinterpret_cast(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0))); +#else // floating point requantisation + float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast(accs[i])); + fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale)); + fp_acc = vaddq_f32(fp_acc, 
vdupq_n_f32(static_cast(_output_quant.offset))); + fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f)); + final_accs[i] = vcvtq_u32_f32(fp_acc); +#endif + } + + uint8x8_t output = vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1]))); + + // Apply the activation function + if (Activation == ActivationFunction::ReLU || + Activation == ActivationFunction::ReLU6) + { + output = vmax_u8(output, vdup_n_u8(aqmin)); + } + if (Activation == ActivationFunction::ReLU6) + { + output = vmin_u8(output, vdup_n_u8(aqmax)); + } - if (actfn == nck::ActivationFunction::ReLU6) { - const int32_t top_rail = output_quant.quantize(6.0f); - clamp_max = std::min(clamp_max, top_rail); + vst1_u8(outptrs[oi][oj] + n, output); + } + } } + for (; n_channels; n_channels--, n++) + { + // Load bias + const int32_t bias = *reinterpret_cast(wbptr); + wbptr += sizeof(int32_t); - // Call the tile execution method - tilefn(n_channels, packed_params, get_input_ptr, get_output_ptr, - clamp_max, clamp_min, input_quant.offset, - weight_quant.offset, output_quant.offset, - requant.multiplier, requant.shift); -} + // Load weights + uint8_t weights[KernelRows][KernelCols]; + for (unsigned int i = 0; i < KernelRows; i++) + { + for (unsigned int j = 0; j < KernelCols; j++) + { + weights[i][j] = *(wbptr++); + } + } -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -template -void QAsymm8DepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols ->::execute_tile( - int n_channels, - const void* packed_params, - const uint8_t* inptr, - unsigned int in_row_stride, - unsigned int in_col_stride, - uint8_t* outptr, - unsigned int out_row_stride, - unsigned int out_col_stride - ) { - // Construct methods to get pointers - const auto get_input_ptr = [inptr, in_row_stride, in_col_stride]( - const int i, const int j, const int channel) { - return inptr + i * in_row_stride + j * in_col_stride + channel; - }; - - const auto get_output_ptr = [outptr, out_row_stride, out_col_stride]( - const int i, const int j, const int channel) { - return outptr + i * out_row_stride + j * out_col_stride + channel; - }; - - execute_tilefn( - n_channels, packed_params, Activation, get_input_ptr, get_output_ptr, - _inputs_quant, _weights_quant, _output_quant, rescale_parameters); -} + // Load the input activations + uint8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols]; + for (unsigned int i = 0; i < Base::inner_tile_rows; i++) + { + for (unsigned int j = 0; j < Base::inner_tile_cols; j++) + { + inputs[i][j] = *(inptrs[i][j] + n); + } + } -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -template -void QAsymm8DepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols ->::execute_tile( - int n_channels, - const void* packed_params, - const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols], - uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols] - ) { - // Construct methods to get pointers - const auto get_input_ptr = [inptrs](const int i, const int j, - const int channel) { - return inptrs[i][j] + channel; - }; - - const auto get_output_ptr = [outptrs](const int i, const int j, - const int channel) { - return outptrs[i][j] + channel; - }; - - // Call the tile execution method - 
execute_tilefn( - n_channels, packed_params, Activation, get_input_ptr, get_output_ptr, - _inputs_quant, _weights_quant, _output_quant, rescale_parameters); + // Perform the convolution + for (unsigned int oi = 0; oi < OutputTileRows; oi++) + { + for (unsigned int oj = 0; oj < OutputTileCols; oj++) + { + int32_t acc = bias; + uint32_t element_sum = 0; + + for (unsigned int wi = 0; wi < KernelRows; wi++) + { + for (unsigned int wj = 0; wj < KernelCols; wj++) + { + const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj]; + acc += static_cast(static_cast(w) * static_cast(x)); + element_sum += static_cast(x); + } + } + + acc -= static_cast(element_sum) * static_cast(_weights_quant.offset); + + // Requantize +#ifdef FIXED_POINT_REQUANTISATION + acc = rounding_divide_by_exp2( + saturating_doubling_high_mul(acc, rescale_parameters.multiplier), + rescale_parameters.shift + ); + acc += _output_quant.offset; + uint8_t output = clamp_to_limits::clamp_and_cast(acc); +#else // floating point requantization + float fp_acc = static_cast(acc); + fp_acc *= rescale_parameters.rescale; + fp_acc += static_cast(_output_quant.offset); + fp_acc = std::max(fp_acc, 0.0f); + uint8_t output = static_cast(std::min(static_cast(fp_acc), 255)); +#endif + + // Apply the activation function + if (Activation == ActivationFunction::ReLU || + Activation == ActivationFunction::ReLU6) + { + output = std::max(output, aqmin); + } + if (Activation == ActivationFunction::ReLU6) + { + output = std::min(output, aqmax); + } + + *(outptrs[oi][oj] + n) = output; + } + } + } } } // namespace depthwise -- cgit v1.2.1
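A final detail of the quantised path worth spelling out is get_activation_fn: a QAsymm8 output can only represent values in [dequantize(0), dequantize(255)], so a ReLU whose lower bound already coincides with quantize(0) == 0, or a ReLU6 whose bound lies above dequantize(255), constrains nothing and is dropped (or ReLU6 is relaxed to a plain ReLU) before the templated kernel is chosen. A scalar illustration, with our own stand-in quantisation helpers:

    #include <cstdio>

    struct Quant { float scale; int offset; };

    static float dequantize(const Quant &q, int v) { return q.scale * (v - q.offset); }

    int main()
    {
      const Quant out{6.0f / 255.0f, 0};  // offset 0: quantize(0.0f) == 0
      const bool relu_redundant  = (out.offset == 0);
      const bool relu6_redundant = relu_redundant && dequantize(out, 255) <= 6.0f;
      std::printf("ReLU redundant: %d, ReLU6 redundant: %d\n",
                  relu_redundant, relu6_redundant);  // 1, 1 for this quantisation
      return 0;
    }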