From d02d5edfa15ba6c04a9986a8a362a945cb38ac31 Mon Sep 17 00:00:00 2001
From: Michele Di Giorgio <michele.digiorgio@arm.com>
Date: Fri, 22 Jan 2021 09:47:04 +0000
Subject: Integrate improved CPU depthwise convolution kernels

* Replace assembly kernels for depthwise convolution with more optimized ones.
* Add int8 assembly kernels.
* Fix implicit padding on optimized kernels

Resolves: COMPMID-3867, COMPMID-4361

Change-Id: I0b0867e05f61be4f368f62190d55e14d0ab3ebf2
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5622
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
---
 .../kernels/convolution/depthwise/impl_qa8_qa8.hpp | 511 ---------------------
 1 file changed, 511 deletions(-)
 delete mode 100644 src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp

diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
deleted file mode 100644
index e8b4c7bc0f..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
+++ /dev/null
@@ -1,511 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- *
- * NOTE: Header to be included by implementation files only.
- *
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- */
-
-#include <limits>
-
-#include "arm.hpp"
-#include "impl_base.hpp"
-#include "depthwise_quantized.hpp"
-
-namespace depthwise
-{
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QAsymm8DepthwiseConvolution(
-  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-  const ActivationFunction activation,
-  const QAsymm8Params& weight_quantisation,
-  const QAsymm8Params& input_quantisation,
-  const QAsymm8Params& output_quantisation,
-  unsigned int padding_top,
-  unsigned int padding_left,
-  unsigned int padding_bottom,
-  unsigned int padding_right
-) : QAsymm8DepthwiseConvolution(
-    n_batches, n_input_rows, n_input_cols, n_channels,
-    activation, weight_quantisation, input_quantisation, output_quantisation,
-    QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
-    padding_top, padding_left, padding_bottom, padding_right
-  )
-{
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QAsymm8DepthwiseConvolution(
-  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-  int n_output_rows, int n_output_cols,
-  const ActivationFunction activation,
-  const QAsymm8Params& weight_quantisation,
-  const QAsymm8Params& input_quantisation,
-  const QAsymm8Params& output_quantisation,
-  unsigned int padding_top,
-  unsigned int padding_left,
-  unsigned int padding_bottom,
-  unsigned int padding_right
-) : QAsymm8DepthwiseConvolution(
-    n_batches, n_input_rows, n_input_cols, n_channels,
-    n_output_rows, n_output_cols,
-    activation, weight_quantisation, input_quantisation, output_quantisation,
-    QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
-    padding_top, padding_left, padding_bottom, padding_right
-  )
-{
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QAsymm8DepthwiseConvolution(
-  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-  const ActivationFunction activation,
-  const QAsymm8Params& weight_quantisation,
-  const QAsymm8Params& input_quantisation,
-  const QAsymm8Params& output_quantisation,
-  const QAsymm8RescaleParams& rescale_params,
-  unsigned int padding_top,
-  unsigned int padding_left,
-  unsigned int padding_bottom,
-  unsigned int padding_right
-) : Base(
-    n_batches, n_input_rows, n_input_cols, n_channels, activation,
-    padding_top, padding_left, padding_bottom, padding_right
-  ),
-  _weights_quant(weight_quantisation),
-  _inputs_quant(input_quantisation),
-  _output_quant(output_quantisation),
-  rescale_parameters(rescale_params)
-{
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QAsymm8DepthwiseConvolution(
-  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-  int n_output_rows, int n_output_cols,
-  const ActivationFunction activation,
-  const QAsymm8Params& weight_quantisation,
-  const QAsymm8Params& input_quantisation,
-  const QAsymm8Params& output_quantisation,
-  const QAsymm8RescaleParams& rescale_params,
-  unsigned int padding_top,
-  unsigned int padding_left,
-  unsigned int padding_bottom,
-  unsigned int padding_right
-) : Base(
-    n_batches, n_input_rows, n_input_cols, n_channels,
-    n_output_rows, n_output_cols, activation,
-    padding_top, padding_left, padding_bottom, padding_right
-  ),
-  _weights_quant(weight_quantisation),
-  _inputs_quant(input_quantisation),
-  _output_quant(output_quantisation),
-  rescale_parameters(rescale_params)
-{
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-uint8_t QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::_input_padding_value(void) const
-{
-  return _inputs_quant.offset;
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-void QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::_pack_params(
-  void * const buffer,
-  const void * const weights,
-  const unsigned int weight_row_stride,
-  const unsigned int weight_col_stride,
-  const void * const biases
-) const
-{
-  const uint8_t *wptr = static_cast<const uint8_t *>(weights);
-  const int32_t *bptr = static_cast<const int32_t *>(biases);
-  uint8_t *outptr = static_cast<uint8_t *>(buffer);
-
-  // We set the vector length to use doubles on both Aarch64 and Aarch32.  NOTE
-  // For SVE set this to half the vector length.
-  unsigned int veclen = 8;
-
-  // While there are channels left to process, pack a vector length of them at
-  // a time and reduce the size of vector used as the size of the tensor
-  // decreases.
-  for (
-    unsigned int n_channels = this->n_channels(); n_channels;
-    n_channels -= veclen,
-    outptr += veclen*(sizeof(int32_t) + this->kernel_rows*this->kernel_cols)
-  )
-  {
-    // NOTE Ignore this section if using SVE, the vector length remains the
-    // same and we just don't fill a full register for the tail.
-    while (n_channels < veclen)
-    {
-      // Reduce the vector length to either 8 or 1 (scalar)
-      // TODO Support more vector lengths in `execute_tile`.
-      veclen = (veclen == 16) ? 8 : 1;
-    }
-
-    // Get pointers to bias and weight portions of the output structure.
-    int32_t *out_bptr = reinterpret_cast<int32_t *>(outptr);
-    uint8_t *out_wptr = outptr + veclen*sizeof(int32_t);
-
-    // Copy a vector length of elements
-    for (unsigned int n = 0; n < veclen && n < n_channels; n++)
-    {
-      const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0;
-      out_bptr[n] = bias;
-
-      for (unsigned int i = 0; i < KernelRows; i++)
-      {
-        uint8_t *row_outptr = out_wptr + i*KernelCols*veclen;
-        for (unsigned int j = 0; j < KernelCols; j++)
-        {
-          uint8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride);
-          row_outptr[j*veclen + n] = w;
-        }
-      }
-      wptr++;
-    }
-  }
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols,
-  typename FInput, typename FOutput
->
-static inline void tilefn(
-  int n_channels,
-  const void* packed_params,
-  FInput &get_input_ptr,
-  FOutput &get_output_ptr,
-  const int32_t clamp_max,
-  const int32_t clamp_min,
-  const uint8_t input_offset,
-  const uint8_t weight_offset,
-  const uint8_t output_offset,
-  const int32_t requant_multiplier,
-  const int32_t requant_shift
-)
-{
-  constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows;
-  constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols;
-
-  // Offset into channels
-  int channel = 0;
-
-  // Byte type pointer to weights and biases
-  const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params);
-
-  for (; n_channels >= 8; n_channels -= 8, channel += 8)
-  {
-    const int32x4_t biases[2] = {
-      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
-      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
-    };
-    wbptr += 8*sizeof(int32_t);
-
-    int16x8_t weights[KernelRows][KernelCols];
-    const uint8x8_t woffset = vdup_n_u8(weight_offset);
-    for (unsigned int i = 0; i < KernelRows; i++)
-    {
-      for (unsigned int j = 0; j < KernelCols; j++)
-      {
-        const uint8x8_t w = vld1_u8(wbptr);
-        weights[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(w, woffset));
-        wbptr += 8;
-      }
-    }
-
-    int16x8_t inputs[InnerTileRows][InnerTileCols];
-    const uint8x8_t ioffset = vdup_n_u8(input_offset);
-    for (unsigned int i = 0; i < InnerTileRows; i++)
-    {
-      for (unsigned int j = 0; j < InnerTileCols; j++)
-      {
-        const auto x = vld1_u8(get_input_ptr(i, j, channel));
-        inputs[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(x, ioffset));
-      }
-    }
-
-    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
-    {
-      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
-      {
-        int32x4_t acc_a = biases[0], acc_b = biases[1];
-
-        for (unsigned int wi = 0; wi < KernelRows; wi++)
-        {
-          for (unsigned int wj = 0; wj < KernelCols; wj++)
-          {
-            const auto w = weights[wi][wj];
-            const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj];
-#ifndef __aarch64__
-            acc_a = vmlal_s16(acc_a, vget_low_s16(w), vget_low_s16(x));
-            acc_b = vmlal_s16(acc_b, vget_high_s16(w), vget_high_s16(x));
-#else
-            asm("smlal %[acc_a].4s, %[w].4h, %[x].4h\n"
-                "smlal2 %[acc_b].4s, %[w].8h, %[x].8h\n"
-                : [acc_a] "+w"(acc_a), [acc_b] "+w"(acc_b)
-                : [w] "w"(w), [x] "w"(x));
-#endif // __aarch64__
-          }
-        }
-
-        int32x4_t final_accs[2];
-        for (unsigned int i = 0; i < 2; i++)
-        {
-          const int32x4_t y = rounding_divide_by_exp2(
-              saturating_doubling_high_mul((i == 0 ? acc_a : acc_b), requant_multiplier),
-              requant_shift);
-          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(output_offset));
-          final_accs[i] = vaddq_s32(y, offset);
-          final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min));
-          final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max));
-        }
-
-#ifndef __aarch64__
-        const int16x8x2_t zelems = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]),
-                                             vreinterpretq_s16_s32(final_accs[1]));
-        const int8x16_t elems = vreinterpretq_s8_s16(zelems.val[0]);
-
-        const int8x16x2_t zoutput = vuzpq_s8(elems, elems);
-        const uint8x8_t output =
-            vget_low_u8(vreinterpretq_u8_s8(zoutput.val[0]));
-        vst1_u8(get_output_ptr(oi, oj, channel), output);
-#else
-        const int8x16_t elems = vreinterpretq_s8_s16(
-            vuzp1q_s16(vreinterpretq_s16_s32(final_accs[0]),
-                       vreinterpretq_s16_s32(final_accs[1])));
-        const uint8x8_t output =
-            vget_low_u8(vreinterpretq_u8_s8(vuzp1q_s8(elems, elems)));
-        vst1_u8(get_output_ptr(oi, oj, channel), output);
-#endif // __aarch64__
-      }
-    }
-  }
-  for (; n_channels; n_channels--, channel++)
-  {
-    // Load bias
-    const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
-    wbptr += sizeof(int32_t);
-
-    // Load weights
-    int16_t weights[KernelRows][KernelCols];
-    for (unsigned int i = 0; i < KernelRows; i++)
-    {
-      for (unsigned int j = 0; j < KernelCols; j++)
-      {
-        weights[i][j] = *(wbptr++) - weight_offset;
-      }
-    }
-
-    // Load the input activations
-    int16_t inputs[InnerTileRows][InnerTileCols];
-    for (unsigned int i = 0; i < InnerTileRows; i++)
-    {
-      for (unsigned int j = 0; j < InnerTileCols; j++)
-      {
-        inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset;
-      }
-    }
-
-    // Perform the convolution
-    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
-    {
-      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
-      {
-        int32_t acc = bias;
-
-        for (unsigned int wi = 0; wi < KernelRows; wi++)
-        {
-          for (unsigned int wj = 0; wj < KernelCols; wj++)
-          {
-            const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
-            acc += w * x;
-          }
-        }
-
-        // Requantize
-        acc = rounding_divide_by_exp2(
-            saturating_doubling_high_mul(acc, requant_multiplier),
-            requant_shift);
-        acc += output_offset;
-        acc = std::max(acc, clamp_min);
-        acc = std::min(acc, clamp_max);
-        uint8_t output = static_cast<uint8_t>(acc);
-        *(get_output_ptr(oi, oj, channel)) = output;
-      }
-    }
-  }
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols,
-  typename FInput, typename FOutput
->
-static inline void execute_tilefn(
-  int n_channels,
-  const void* packed_params,
-  const nck::ActivationFunction actfn,
-  FInput &get_input_ptr,
-  FOutput &get_output_ptr,
-  const QAsymm8Params &input_quant,
-  const QAsymm8Params &weight_quant,
-  const QAsymm8Params &output_quant,
-  const QAsymm8RescaleParams &requant
-) {
-  // Compute min/max clamp values
-  int32_t clamp_min = std::numeric_limits<uint8_t>::min();
-  int32_t clamp_max = std::numeric_limits<uint8_t>::max();
-
-  if (actfn == nck::ActivationFunction::ReLU ||
-      actfn == nck::ActivationFunction::ReLU6) {
-    const int32_t bottom_rail = output_quant.offset;
-    clamp_min = std::max(clamp_min, bottom_rail);
-  }
-
-  if (actfn == nck::ActivationFunction::ReLU6) {
-    const int32_t top_rail = output_quant.quantize(6.0f);
-    clamp_max = std::min(clamp_max, top_rail);
-  }
-
-  // Call the tile execution method
-  tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
-         StrideCols>(n_channels, packed_params, get_input_ptr, get_output_ptr,
-                     clamp_max, clamp_min, input_quant.offset,
-                     weight_quant.offset, output_quant.offset,
-                     requant.multiplier, requant.shift);
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-template <nck::ActivationFunction Activation>
-void QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::execute_tile(
-  int n_channels,
-  const void* packed_params,
-  const uint8_t* inptr,
-  unsigned int in_row_stride,
-  unsigned int in_col_stride,
-  uint8_t* outptr,
-  unsigned int out_row_stride,
-  unsigned int out_col_stride
-) {
-  // Construct methods to get pointers
-  const auto get_input_ptr = [inptr, in_row_stride, in_col_stride](
-      const int i, const int j, const int channel) {
-    return inptr + i * in_row_stride + j * in_col_stride + channel;
-  };
-
-  const auto get_output_ptr = [outptr, out_row_stride, out_col_stride](
-      const int i, const int j, const int channel) {
-    return outptr + i * out_row_stride + j * out_col_stride + channel;
-  };
-
-  execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
-                 StrideRows, StrideCols>(
-      n_channels, packed_params, Activation, get_input_ptr, get_output_ptr,
-      _inputs_quant, _weights_quant, _output_quant, rescale_parameters);
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-template <nck::ActivationFunction Activation>
-void QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::execute_tile(
-  int n_channels,
-  const void* packed_params,
-  const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
-  uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
-) {
-  // Construct methods to get pointers
-  const auto get_input_ptr = [inptrs](const int i, const int j,
-                                      const int channel) {
-    return inptrs[i][j] + channel;
-  };
-
-  const auto get_output_ptr = [outptrs](const int i, const int j,
-                                        const int channel) {
-    return outptrs[i][j] + channel;
-  };
-
-  // Call the tile execution method
-  execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
-                 StrideRows, StrideCols>(
-      n_channels, packed_params, Activation, get_input_ptr, get_output_ptr,
-      _inputs_quant, _weights_quant, _output_quant, rescale_parameters);
-}
-
-}  // namespace depthwise
--
cgit v1.2.1
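
A note on the tile geometry used by the deleted tilefn(): the input (inner) tile is
derived from the output tile, stride and kernel size as

    InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows
    InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols

so a 3x3 kernel at stride 1 producing a 2x2 output tile reads a 4x4 input tile
(1 * (2 - 1) + 3 = 4), and the same kernel at stride 2 reads a 5x5 input tile
(2 * (2 - 1) + 3 = 5). This is why implicit padding matters for these kernels:
edge tiles read beyond the valid input region unless padded with the input
quantisation offset, which is what _input_padding_value() returns.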
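
The requantisation in both the vector and scalar paths of the deleted kernel is
the usual fixed-point scheme: a saturating doubling high multiply by
requant.multiplier followed by a rounding right shift by requant.shift, then the
output offset and clamping. The two helpers are declared in impl_base.hpp, which
this patch does not touch; the scalar sketch below models their conventional
(gemmlowp-style) behaviour under that assumption and is not a copy of the
library's definitions.

    #include <algorithm>
    #include <cstdint>

    // Assumed stand-in for the impl_base.hpp helper: models AArch64 SQRDMULH
    // on one lane, i.e. (a * b * 2) rounded into the high 32 bits, with the
    // single overflowing case saturated.
    static inline int32_t saturating_doubling_high_mul(int32_t a, int32_t b)
    {
      if (a == INT32_MIN && b == INT32_MIN)
      {
        return INT32_MAX;  // only case where the doubled product overflows
      }
      const int64_t ab = static_cast<int64_t>(a) * static_cast<int64_t>(b);
      const int64_t nudge = (ab >= 0) ? (INT64_C(1) << 30) : (1 - (INT64_C(1) << 30));
      return static_cast<int32_t>((ab + nudge) >> 31);
    }

    // Assumed stand-in for the impl_base.hpp helper: arithmetic shift right
    // by `shift` (assumed >= 0), rounding half away from zero.
    static inline int32_t rounding_divide_by_exp2(int32_t x, int32_t shift)
    {
      const int32_t mask = (INT32_C(1) << shift) - 1;
      const int32_t remainder = x & mask;
      const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
      return (x >> shift) + ((remainder > threshold) ? 1 : 0);
    }

    // One output element, mirroring the scalar tail loop of tilefn().
    static inline uint8_t requantize(int32_t acc, int32_t multiplier,
                                     int32_t shift, int32_t output_offset,
                                     int32_t clamp_min, int32_t clamp_max)
    {
      acc = rounding_divide_by_exp2(
          saturating_doubling_high_mul(acc, multiplier), shift);
      acc += output_offset;
      acc = std::max(acc, clamp_min);
      acc = std::min(acc, clamp_max);
      return static_cast<uint8_t>(acc);
    }

In this scheme the multiplier holds (input_scale * weight_scale) / output_scale
as a Q0.31 fixed-point value and the shift is the accompanying right shift, the
pair computed by QAsymm8RescaleParams::make_rescale_params() from the three
quantisation parameter sets passed to the constructors above.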