/*
 * Copyright (c) 2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/*
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 *
 * NOTE: Header to be included by implementation files only.
 *
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 */

#include <limits>

#include "arm.hpp"
#include "impl_base.hpp"
#include "depthwise_quantized.hpp"

namespace depthwise
{

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QAsymm8DepthwiseConvolution(
  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
  const ActivationFunction activation,
  const QAsymm8Params& weight_quantisation,
  const QAsymm8Params& input_quantisation,
  const QAsymm8Params& output_quantisation,
  unsigned int padding_top,
  unsigned int padding_left,
  unsigned int padding_bottom,
  unsigned int padding_right
) : QAsymm8DepthwiseConvolution(
    n_batches, n_input_rows, n_input_cols, n_channels,
    activation, weight_quantisation, input_quantisation, output_quantisation,
    QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
    padding_top, padding_left, padding_bottom, padding_right
  )
{
}

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QAsymm8DepthwiseConvolution(
  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
  int n_output_rows, int n_output_cols,
  const ActivationFunction activation,
  const QAsymm8Params& weight_quantisation,
  const QAsymm8Params& input_quantisation,
  const QAsymm8Params& output_quantisation,
  unsigned int padding_top,
  unsigned int padding_left,
  unsigned int padding_bottom,
  unsigned int padding_right
) : QAsymm8DepthwiseConvolution(
    n_batches, n_input_rows, n_input_cols, n_channels,
    n_output_rows, n_output_cols,
    activation, weight_quantisation, input_quantisation, output_quantisation,
    QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
    padding_top, padding_left, padding_bottom, padding_right
  )
{
}
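// The two constructors above take no explicit requantisation parameters: they
// build a QAsymm8RescaleParams (the fixed-point multiplier and shift consumed
// by the tile kernels below) from the weight, input and output quantisation
// parameters via QAsymm8RescaleParams::make_rescale_params, and delegate to
// the constructors that follow.
//
// For example (hypothetical configuration, for illustration only), a 3x3
// kernel with unit stride producing 2x2 output tiles would be instantiated as
// QAsymm8DepthwiseConvolution<2, 2, 3, 3, 1, 1>.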
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QAsymm8DepthwiseConvolution(
  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
  const ActivationFunction activation,
  const QAsymm8Params& weight_quantisation,
  const QAsymm8Params& input_quantisation,
  const QAsymm8Params& output_quantisation,
  const QAsymm8RescaleParams& rescale_params,
  unsigned int padding_top,
  unsigned int padding_left,
  unsigned int padding_bottom,
  unsigned int padding_right
) : Base(
    n_batches, n_input_rows, n_input_cols, n_channels, activation,
    padding_top, padding_left, padding_bottom, padding_right
  ),
  _weights_quant(weight_quantisation),
  _inputs_quant(input_quantisation),
  _output_quant(output_quantisation),
  rescale_parameters(rescale_params)
{
}

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QAsymm8DepthwiseConvolution(
  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
  int n_output_rows, int n_output_cols,
  const ActivationFunction activation,
  const QAsymm8Params& weight_quantisation,
  const QAsymm8Params& input_quantisation,
  const QAsymm8Params& output_quantisation,
  const QAsymm8RescaleParams& rescale_params,
  unsigned int padding_top,
  unsigned int padding_left,
  unsigned int padding_bottom,
  unsigned int padding_right
) : Base(
    n_batches, n_input_rows, n_input_cols, n_channels,
    n_output_rows, n_output_cols, activation,
    padding_top, padding_left, padding_bottom, padding_right
  ),
  _weights_quant(weight_quantisation),
  _inputs_quant(input_quantisation),
  _output_quant(output_quantisation),
  rescale_parameters(rescale_params)
{
}

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
uint8_t QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::_input_padding_value(void) const
{
  return _inputs_quant.offset;
}
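// Layout of the packed parameter buffer written by `_pack_params` below:
// channels are handled in groups of `veclen`. Each group consists of `veclen`
// int32_t biases followed by the uint8_t weights for those channels, stored
// so that the `veclen` channel values for a given kernel position (i, j) are
// contiguous (i.e. weights are interleaved by channel within each kernel
// position). This matches the per-vector loads performed in `tilefn` below.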
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
void QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::_pack_params(
  void * const buffer,
  const void * const weights,
  const unsigned int weight_row_stride,
  const unsigned int weight_col_stride,
  const void * const biases
) const
{
  const uint8_t *wptr = static_cast<const uint8_t *>(weights);
  const int32_t *bptr = static_cast<const int32_t *>(biases);
  uint8_t *outptr = static_cast<uint8_t *>(buffer);

  // We set the vector length to use doubles on both Aarch64 and Aarch32.  NOTE
  // For SVE set this to half the vector length.
  unsigned int veclen = 8;

  // While there are channels left to process, pack a vector length of them at
  // a time and reduce the size of vector used as the size of the tensor
  // decreases.
  for (
    unsigned int n_channels = this->n_channels(); n_channels;
    n_channels -= veclen,
    outptr += veclen*(sizeof(int32_t) + this->kernel_rows*this->kernel_cols)
  )
  {
    // NOTE Ignore this section if using SVE, the vector length remains the
    // same and we just don't fill a full register for the tail.
    while (n_channels < veclen)
    {
      // Reduce the vector length to either 8 or 1 (scalar)
      // TODO Support more vector lengths in `execute_tile`.
      veclen = (veclen == 16) ? 8 : 1;
    }

    // Get pointers to bias and weight portions of the output structure.
    int32_t *out_bptr = reinterpret_cast<int32_t *>(outptr);
    uint8_t *out_wptr = outptr + veclen*sizeof(int32_t);

    // Copy a vector length of elements
    for (unsigned int n = 0; n < veclen && n < n_channels; n++)
    {
      const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0;
      out_bptr[n] = bias;

      for (unsigned int i = 0; i < KernelRows; i++)
      {
        uint8_t *row_outptr = out_wptr + i*KernelCols*veclen;
        for (unsigned int j = 0; j < KernelCols; j++)
        {
          uint8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride);
          row_outptr[j*veclen + n] = w;
        }
      }
      wptr++;
    }
  }
}
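// `tilefn` computes one output tile for every channel. Weights and inputs are
// loaded as uint8, the respective zero-point offsets are subtracted, and the
// products are accumulated into 32-bit integers on top of the bias. Each
// accumulator is then requantised into the output space as:
//
//   y   = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, requant_multiplier), requant_shift)
//   out = clamp(y + output_offset, clamp_min, clamp_max)
//
// Channels are processed eight at a time with NEON; a scalar loop handles any
// remaining channels.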
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols,
  typename FInput, typename FOutput
>
static inline void tilefn(
  int n_channels,
  const void* packed_params,
  FInput &get_input_ptr,
  FOutput &get_output_ptr,
  const int32_t clamp_max,
  const int32_t clamp_min,
  const uint8_t input_offset,
  const uint8_t weight_offset,
  const uint8_t output_offset,
  const int32_t requant_multiplier,
  const int32_t requant_shift
)
{
  constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows;
  constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols;

  // Offset into channels
  int channel = 0;

  // Byte type pointer to weights and biases
  const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params);

  for (; n_channels >= 8; n_channels -= 8, channel += 8)
  {
    const int32x4_t biases[2] = {
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
    };
    wbptr += 8*sizeof(int32_t);

    int16x8_t weights[KernelRows][KernelCols];
    const uint8x8_t woffset = vdup_n_u8(weight_offset);
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        const uint8x8_t w = vld1_u8(wbptr);
        weights[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(w, woffset));
        wbptr += 8;
      }
    }

    int16x8_t inputs[InnerTileRows][InnerTileCols];
    const uint8x8_t ioffset = vdup_n_u8(input_offset);
    for (unsigned int i = 0; i < InnerTileRows; i++)
    {
      for (unsigned int j = 0; j < InnerTileCols; j++)
      {
        const auto x = vld1_u8(get_input_ptr(i, j, channel));
        inputs[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(x, ioffset));
      }
    }

    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        int32x4_t acc_a = biases[0], acc_b = biases[1];

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            const auto w = weights[wi][wj];
            const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj];
#ifndef __aarch64__
            acc_a = vmlal_s16(acc_a, vget_low_s16(w), vget_low_s16(x));
            acc_b = vmlal_s16(acc_b, vget_high_s16(w), vget_high_s16(x));
#else
            asm("smlal %[acc_a].4s, %[w].4h, %[x].4h\n"
                "smlal2 %[acc_b].4s, %[w].8h, %[x].8h\n"
                : [acc_a] "+w"(acc_a), [acc_b] "+w"(acc_b)
                : [w] "w"(w), [x] "w"(x));
#endif  // __aarch64__
          }
        }

        int32x4_t final_accs[2];
        for (unsigned int i = 0; i < 2; i++)
        {
          const int32x4_t y = rounding_divide_by_exp2(
              saturating_doubling_high_mul((i == 0 ? acc_a : acc_b), requant_multiplier),
              requant_shift);
          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(output_offset));
          final_accs[i] = vaddq_s32(y, offset);
          final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min));
          final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max));
        }

#ifndef __aarch64__
        const int16x8x2_t zelems = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]),
                                             vreinterpretq_s16_s32(final_accs[1]));
        const int8x16_t elems = vreinterpretq_s8_s16(zelems.val[0]);

        const int8x16x2_t zoutput = vuzpq_s8(elems, elems);
        const uint8x8_t output = vget_low_u8(vreinterpretq_u8_s8(zoutput.val[0]));
        vst1_u8(get_output_ptr(oi, oj, channel), output);
#else
        const int8x16_t elems = vreinterpretq_s8_s16(
            vuzp1q_s16(vreinterpretq_s16_s32(final_accs[0]),
                       vreinterpretq_s16_s32(final_accs[1])));
        const uint8x8_t output =
            vget_low_u8(vreinterpretq_u8_s8(vuzp1q_s8(elems, elems)));
        vst1_u8(get_output_ptr(oi, oj, channel), output);
#endif  // __aarch64__
      }
    }
  }

  for (; n_channels; n_channels--, channel++)
  {
    // Load bias
    const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
    wbptr += sizeof(int32_t);

    // Load weights
    int16_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = *(wbptr++) - weight_offset;
      }
    }

    // Load the input activations
    int16_t inputs[InnerTileRows][InnerTileCols];
    for (unsigned int i = 0; i < InnerTileRows; i++)
    {
      for (unsigned int j = 0; j < InnerTileCols; j++)
      {
        inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset;
      }
    }

    // Perform the convolution
    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        int32_t acc = bias;

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            const auto w = weights[wi][wj],
                       x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
            acc += w * x;
          }
        }

        // Requantize
        acc = rounding_divide_by_exp2(
            saturating_doubling_high_mul(acc, requant_multiplier),
            requant_shift);
        acc += output_offset;
        acc = std::max(acc, clamp_min);
        acc = std::min(acc, clamp_max);
        uint8_t output = static_cast<uint8_t>(acc);
        *(get_output_ptr(oi, oj, channel)) = output;
      }
    }
  }
}

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols,
  typename FInput, typename FOutput
>
static inline void execute_tilefn(
  int n_channels,
  const void* packed_params,
  const nck::ActivationFunction actfn,
  FInput &get_input_ptr,
  FOutput &get_output_ptr,
  const QAsymm8Params &input_quant,
  const QAsymm8Params &weight_quant,
  const QAsymm8Params &output_quant,
  const QAsymm8RescaleParams &requant
)
{
  // Compute min/max clamp values
  int32_t clamp_min = std::numeric_limits<uint8_t>::min();
  int32_t clamp_max = std::numeric_limits<uint8_t>::max();

  if (actfn == nck::ActivationFunction::ReLU ||
      actfn == nck::ActivationFunction::ReLU6)
  {
    const int32_t bottom_rail = output_quant.offset;
    clamp_min = std::max(clamp_min, bottom_rail);
  }

  if (actfn == nck::ActivationFunction::ReLU6)
  {
    const int32_t top_rail = output_quant.quantize(6.0f);
    clamp_max = std::min(clamp_max, top_rail);
  }

  // Call the tile execution method
  tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
         StrideCols>(n_channels, packed_params, get_input_ptr, get_output_ptr,
                     clamp_max, clamp_min, input_quant.offset,
                     weight_quant.offset, output_quant.offset,
                     requant.multiplier, requant.shift);
}
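// `execute_tilefn` above folds the activation function into the clamp range
// in the quantised output domain (ReLU clamps at the output zero point, which
// represents a real value of zero; ReLU6 additionally clamps at the quantised
// value of 6.0f) before dispatching to `tilefn`. The two `execute_tile`
// definitions below differ only in how input and output addresses are formed:
// the first from base pointers and row/column strides, the second from
// precomputed pointer arrays. Both wrap the addressing in lambdas so that
// `tilefn` is independent of the tensor layout.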
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
template <nck::ActivationFunction Activation>
void QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::execute_tile(
  int n_channels,
  const void* packed_params,
  const uint8_t* inptr,
  unsigned int in_row_stride,
  unsigned int in_col_stride,
  uint8_t* outptr,
  unsigned int out_row_stride,
  unsigned int out_col_stride
)
{
  // Construct methods to get pointers
  const auto get_input_ptr = [inptr, in_row_stride, in_col_stride](
      const int i, const int j, const int channel) {
    return inptr + i * in_row_stride + j * in_col_stride + channel;
  };

  const auto get_output_ptr = [outptr, out_row_stride, out_col_stride](
      const int i, const int j, const int channel) {
    return outptr + i * out_row_stride + j * out_col_stride + channel;
  };

  execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
                 StrideRows, StrideCols>(
      n_channels, packed_params, Activation, get_input_ptr, get_output_ptr,
      _inputs_quant, _weights_quant, _output_quant, rescale_parameters);
}

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
template <nck::ActivationFunction Activation>
void QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::execute_tile(
  int n_channels,
  const void* packed_params,
  const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
  uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
)
{
  // Construct methods to get pointers
  const auto get_input_ptr = [inptrs](const int i, const int j, const int channel) {
    return inptrs[i][j] + channel;
  };

  const auto get_output_ptr = [outptrs](const int i, const int j, const int channel) {
    return outptrs[i][j] + channel;
  };

  // Call the tile execution method
  execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
                 StrideRows, StrideCols>(
      n_channels, packed_params, Activation, get_input_ptr, get_output_ptr,
      _inputs_quant, _weights_quant, _output_quant, rescale_parameters);
}

}  // namespace depthwise