Diffstat (limited to 'src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp')
-rw-r--r--  src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp  457
1 file changed, 0 insertions, 457 deletions
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp
deleted file mode 100644
index 68e20d98a9..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp
+++ /dev/null
@@ -1,457 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- *
- * NOTE: Header to be included by implementation files only.
- *
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- */
-
-#include <limits>
-
-#include "arm.hpp"
-#include "impl_base.hpp"
-#include "depthwise_quantized.hpp"
-
-#pragma once
-
-namespace {
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols,
- typename FInput, typename FOutput
->
-static inline void tilefn_hybrid(
- int n_channels,
- const void* packed_params,
- FInput &get_input_ptr,
- FOutput &get_output_ptr,
- int32_t clamp_min,
- int32_t clamp_max,
- uint8_t input_offset,
- uint8_t output_offset
-)
-{
- constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows;
- constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols;
-
- // Offset into channels
- int channel = 0;
-
- // Byte type pointer to weights and biases
- const int8_t *wbptr = static_cast<const int8_t *>(packed_params);
-
- for (; n_channels >= 8; n_channels -= 8, channel += 8)
- {
- const int32x4_t biases[2] = {
- vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
- vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
- };
- const int32x4_t multipliers[2] = {
- vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 12),
- };
- const int32x4_t shifts[2] = {
- vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 16),
- vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 20),
- };
- wbptr += 24*sizeof(int32_t);
-
- int16x8_t weights[KernelRows][KernelCols];
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- const auto w = vld1_s8(wbptr);
- weights[i][j] = reinterpret_cast<int16x8_t>(vmovl_s8(w));
- wbptr += 8;
- }
- }
-
- int16x8_t inputs[InnerTileRows][InnerTileCols];
- const uint8x8_t ioffset = vdup_n_u8(input_offset);
- for (unsigned int i = 0; i < InnerTileRows; i++)
- {
- for (unsigned int j = 0; j < InnerTileCols; j++)
- {
- const auto x = vld1_u8(get_input_ptr(i, j, channel));
- inputs[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(x, ioffset));
- }
- }
-
- for (unsigned int oi = 0; oi < OutputTileRows; oi++)
- {
- for (unsigned int oj = 0; oj < OutputTileCols; oj++)
- {
- int32x4_t accs[2];
- for (unsigned int i = 0; i < 2; i++)
- {
- accs[i] = biases[i];
- }
-
- for (unsigned int wi = 0; wi < KernelRows; wi++)
- {
- for (unsigned int wj = 0; wj < KernelCols; wj++)
- {
- const auto w = weights[wi][wj];
- const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj];
- accs[0] = vmlal_s16(accs[0], vget_low_s16(w), vget_low_s16(x));
- accs[1] = vmlal_s16(accs[1], vget_high_s16(w), vget_high_s16(x));
- }
- }
-
- int32x4_t final_accs[2];
- for (unsigned int i = 0; i < 2; i++)
- {
- const int32x4_t y = rounding_divide_by_exp2(
- saturating_doubling_high_mul(accs[i], multipliers[i]),
- shifts[i]);
- const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(output_offset));
- final_accs[i] = vaddq_s32(y, offset);
- final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min));
- final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max));
- }
-
- const auto elems_s16 = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]),
- vreinterpretq_s16_s32(final_accs[1]));
- const int8x16_t elems = vreinterpretq_s8_s16(elems_s16.val[0]);
- const uint8x8_t output =
- vget_low_u8(vreinterpretq_u8_s8(vuzpq_s8(elems, elems).val[0]));
-
- vst1_u8(get_output_ptr(oi, oj, channel), output);
- }
- }
- }
-
- for (; n_channels; n_channels--, channel++)
- {
- // Load bias
- const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
- const int32_t multiplier = *reinterpret_cast<const int32_t *>(wbptr + sizeof(int32_t));
- const int32_t shift = *reinterpret_cast<const int32_t *>(wbptr + 2*sizeof(int32_t));
-
- wbptr += 3*sizeof(int32_t);
-
- // Load weights
- int16_t weights[KernelRows][KernelCols];
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- weights[i][j] = *(wbptr++);
- }
- }
-
- // Load the input activations
- int16_t inputs[InnerTileRows][InnerTileCols];
- for (unsigned int i = 0; i < InnerTileRows; i++)
- {
- for (unsigned int j = 0; j < InnerTileCols; j++)
- {
- inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset;
- }
- }
-
- // Perform the convolution
- for (unsigned int oi = 0; oi < OutputTileRows; oi++)
- {
- for (unsigned int oj = 0; oj < OutputTileCols; oj++)
- {
- int32_t acc = bias;
-
- for (unsigned int wi = 0; wi < KernelRows; wi++)
- {
- for (unsigned int wj = 0; wj < KernelCols; wj++)
- {
- const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
- acc += w * x;
- }
- }
-
- // Requantize
- acc = rounding_divide_by_exp2(
- saturating_doubling_high_mul(acc, multiplier),
- -shift);
- acc += output_offset;
- acc = std::max(acc, clamp_min);
- acc = std::min(acc, clamp_max);
- uint8_t output = static_cast<uint8_t>(acc);
- *(get_output_ptr(oi, oj, channel)) = output;
- }
- }
- }
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols,
- typename FInput, typename FOutput
->
-static inline void execute_tilefn_hybrid(
- int n_channels,
- const void* packed_params,
- const ActivationFunction actfn,
- const qasymm8::QAsymm8Params &input_quant,
- const qasymm8::QAsymm8Params &output_quant,
- FInput &get_input_ptr,
- FOutput &get_output_ptr) {
-
- // Compute min/max clamp values
- int32_t clamp_min = std::numeric_limits<uint8_t>::min();
- int32_t clamp_max = std::numeric_limits<uint8_t>::max();
-
- if (actfn == ActivationFunction::ReLU) {
- clamp_min = output_quant.offset;
- }
-
- // ReLU6: clamp the upper rail to the quantised value of 6
- if (actfn == ActivationFunction::ReLU6) {
- const int32_t top_rail = output_quant.quantize(6.0f);
- clamp_max = std::min(clamp_max, top_rail);
- }
-
- // Call the tile execution method
- tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
- StrideCols>(n_channels, packed_params, get_input_ptr, get_output_ptr, clamp_min, clamp_max, input_quant.offset, output_quant.offset);
-}
-}
-
-
-
-namespace depthwise {
-using namespace qsymm8;
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-QSymm8HybridPerChannelDepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QSymm8HybridPerChannelDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- const ActivationFunction activation,
- const QSymm8PerChannelParams& weight_quantisation,
- const qasymm8::QAsymm8Params& input_quantisation,
- const qasymm8::QAsymm8Params& output_quantisation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
-) : QSymm8HybridPerChannelDepthwiseConvolution(
- n_batches, n_input_rows, n_input_cols, n_channels,
- activation, weight_quantisation, input_quantisation, output_quantisation,
- QSymm8PerChannelRescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
- padding_top, padding_left, padding_bottom, padding_right
- )
-{
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-QSymm8HybridPerChannelDepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QSymm8HybridPerChannelDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- const ActivationFunction activation,
- const QSymm8PerChannelParams& weight_quantisation,
- const qasymm8::QAsymm8Params& input_quantisation,
- const qasymm8::QAsymm8Params& output_quantisation,
- const QSymm8PerChannelRescaleParams& rescale_params,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
-) : Base(
- n_batches, n_input_rows, n_input_cols, n_channels, activation,
- padding_top, padding_left, padding_bottom, padding_right
- ),
- _weights_quant(weight_quantisation),
- _input_quant(input_quantisation),
- _output_quant(output_quantisation),
- _rescale_parameters(rescale_params)
-{
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-uint8_t QSymm8HybridPerChannelDepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::_input_padding_value(void) const
-{
- return _input_quant.offset;
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-void QSymm8HybridPerChannelDepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::_pack_params(
- void * const buffer,
- const void * const weights,
- const unsigned int weight_row_stride,
- const unsigned int weight_col_stride,
- const void * const biases
-) const
-{
- const int8_t *wptr = static_cast<const int8_t *>(weights);
- const int32_t *bptr = static_cast<const int32_t *>(biases);
- const int32_t *mptr = static_cast<const int32_t *>(_rescale_parameters.multipliers.data());
- const int32_t *sptr = static_cast<const int32_t *>(_rescale_parameters.shifts.data());
- int8_t *outptr = static_cast<int8_t *>(buffer);
-
- // We use a vector length of eight channels, matching a 64-bit (doubleword)
- // load of int8 data on both AArch64 and AArch32. NOTE For SVE set this to
- // half the vector length.
- unsigned int veclen = 8;
-
- // While there are channels left to process, pack a vector length of them at
- // a time, reducing the vector length used as the number of remaining
- // channels decreases.
- for (
- unsigned int n_channels = this->n_channels(); n_channels;
- n_channels -= veclen,
- outptr += veclen*(3*sizeof(int32_t) + this->kernel_rows*this->kernel_cols)
- )
- {
- // NOTE Ignore this section if using SVE; the vector length remains the
- // same and we just don't fill a full register for the tail.
- while (n_channels < veclen)
- {
- // Reduce the vector length to either 8 or 1 (scalar)
- // TODO Support more vector lengths in `execute_tile`.
- veclen = (veclen == 16) ? 8 : 1;
- }
-
- // Get pointers to bias and weight portions of the output structure.
- int32_t *out_bptr = reinterpret_cast<int32_t *>(outptr);
- int32_t *out_mptr = reinterpret_cast<int32_t *>(outptr + veclen*sizeof(int32_t));
- int32_t *out_sptr = reinterpret_cast<int32_t *>(outptr + 2*veclen*sizeof(int32_t));
- int8_t *out_wptr = outptr + 3*veclen*sizeof(int32_t);
-
- // Copy a vector length of elements
- for (unsigned int n = 0; n < veclen && n < n_channels; n++)
- {
- const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0;
- const int32_t multiplier = (mptr != nullptr) ? *(mptr++) : 0;
- const int32_t shift = (sptr != nullptr) ? *(sptr++) : 0;
-
- out_bptr[n] = bias;
- out_mptr[n] = multiplier;
- out_sptr[n] = -shift;
-
- for (unsigned int i = 0; i < KernelRows; i++)
- {
- int8_t *row_outptr = out_wptr + i*KernelCols*veclen;
- for (unsigned int j = 0; j < KernelCols; j++)
- {
- int8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride);
- row_outptr[j*veclen + n] = w;
- }
- }
- wptr++;
- }
- }
-}
-
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-template <ActivationFunction Activation>
-void QSymm8HybridPerChannelDepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::execute_tile(
- int n_channels,
- const void* packed_params,
- const uint8_t* inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- uint8_t* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride
-) {
-
- // Construct methods to get pointers
- const auto get_input_ptr = [inptr, in_row_stride, in_col_stride](
- const int i, const int j, const int channel) {
- return inptr + i * in_row_stride + j * in_col_stride + channel;
- };
-
- const auto get_output_ptr = [outptr, out_row_stride, out_col_stride](
- const int i, const int j, const int channel) {
- return outptr + i * out_row_stride + j * out_col_stride + channel;
- };
-
- execute_tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
- StrideRows, StrideCols>(
- n_channels, packed_params, Activation, _input_quant, _output_quant, get_input_ptr, get_output_ptr);
-}
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-template <ActivationFunction Activation>
-void QSymm8HybridPerChannelDepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::execute_tile(
- int n_channels,
- const void* packed_params,
- const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
- uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
-) {
- // Construct methods to get pointers
- const auto get_input_ptr = [inptrs](const int i, const int j,
- const int channel) {
- return inptrs[i][j] + channel;
- };
-
- const auto get_output_ptr = [outptrs](const int i, const int j,
- const int channel) {
- return outptrs[i][j] + channel;
- };
-
- // Call the tile execution method
- execute_tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
- StrideRows, StrideCols>(
- n_channels, packed_params, Activation, _input_quant, _output_quant, get_input_ptr, get_output_ptr);
-}
-
-} // namespace depthwise
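
Note on the requantisation used above: for each block of eight channels the packed parameter buffer holds 8 biases, 8 per-channel multipliers and 8 negated shifts (24 int32 words) followed by the channel-interleaved int8 kernel weights, and the scalar tail packs 3 int32 words plus the weights per channel. Each accumulator is then requantised with a gemmlowp-style fixed-point scheme: a saturating doubling high multiply by the per-channel multiplier, a rounding right shift, addition of the output offset, and a clamp. The library's own NEON and scalar helpers live in the headers included at the top of the deleted file; the sketch below is an illustrative, self-contained scalar rendering of that scheme, not the library's exact implementation.

#include <algorithm>
#include <cstdint>
#include <limits>

// Illustrative scalar helper: high 32 bits of 2*a*b with rounding, saturating
// the single overflow case (both operands equal to INT32_MIN).
static inline int32_t saturating_doubling_high_mul(int32_t a, int32_t b)
{
  const bool overflow = (a == b) && (a == std::numeric_limits<int32_t>::min());
  const int64_t ab = static_cast<int64_t>(a) * static_cast<int64_t>(b);
  const int64_t nudge = (ab >= 0) ? (1ll << 30) : (1 - (1ll << 30));
  const int32_t result = static_cast<int32_t>((ab + nudge) / (1ll << 31));
  return overflow ? std::numeric_limits<int32_t>::max() : result;
}

// Illustrative scalar helper: arithmetic right shift by a non-negative
// exponent with round-to-nearest on the discarded bits.
static inline int32_t rounding_divide_by_exp2(int32_t x, int exponent)
{
  const int32_t mask = (int32_t(1) << exponent) - 1;
  const int32_t remainder = x & mask;
  const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
  return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}

// Requantise one accumulator as the scalar tail of tilefn_hybrid does.
// `shift` here is the positive right-shift amount; the packed buffer in the
// deleted code stores its negation, which the scalar tail negates back.
static inline uint8_t requantise(int32_t acc, int32_t multiplier, int32_t shift,
                                 int32_t output_offset,
                                 int32_t clamp_min, int32_t clamp_max)
{
  acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, multiplier), shift);
  acc += output_offset;
  acc = std::max(acc, clamp_min);
  acc = std::min(acc, clamp_max);
  return static_cast<uint8_t>(acc);
}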