1 files changed, 0 insertions, 457 deletions
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp
deleted file mode 100644
index 68e20d98a9..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp
+++ /dev/null
@@ -1,457 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- *
- *          NOTE: Header to be included by implementation files only.
- *
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- */
-
-#include <limits>
-
-#include "arm.hpp"
-#include "impl_base.hpp"
-#include "depthwise_quantized.hpp"
-
-#pragma once
-
-namespace {
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols,
-  typename FInput, typename FOutput
->
-static inline void tilefn_hybrid(
-  int n_channels,
-  const void* packed_params,
-  FInput &get_input_ptr,
-  FOutput &get_output_ptr,
-  int32_t clamp_min,
-  int32_t clamp_max,
-  uint8_t input_offset,
-  uint8_t output_offset
-)
-{
-  constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows;
-  constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols;
-
-  // Offset into channels
-  int channel = 0;
-
-  // Byte type pointer to weights and biases
-  const int8_t *wbptr = static_cast<const int8_t *>(packed_params);
-
-  for (; n_channels >= 8; n_channels -= 8, channel += 8)
-  {
-    const int32x4_t biases[2] = {
-      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
-      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
-    };
-    const int32x4_t multipliers[2] = {
-      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 8),
-      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 12),
-    };
-    const int32x4_t shifts[2] = {
-      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 16),
-      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 20),
-    };
-    wbptr += 24*sizeof(int32_t);
-
-    int16x8_t weights[KernelRows][KernelCols];
-    for (unsigned int i = 0; i < KernelRows; i++)
-    {
-      for (unsigned int j = 0; j < KernelCols; j++)
-      {
-        const auto w = vld1_s8(wbptr);
-        weights[i][j] = reinterpret_cast<int16x8_t>(vmovl_s8(w));
-        wbptr += 8;
-      }
-    }
-
-    int16x8_t inputs[InnerTileRows][InnerTileCols];
-    const uint8x8_t ioffset = vdup_n_u8(input_offset);
-    for (unsigned int i = 0; i < InnerTileRows; i++)
-    {
-      for (unsigned int j = 0; j < InnerTileCols; j++)
-      {
-        const auto x = vld1_u8(get_input_ptr(i, j, channel));
-        inputs[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(x, ioffset));
-      }
-    }
-
-    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
-    {
-      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
-      {
-        int32x4_t accs[2];
-        for (unsigned int i = 0; i < 2; i++)
-        {
-          accs[i] = biases[i];
-        }
-
-        for (unsigned int wi = 0; wi < KernelRows; wi++)
-        {
-          for (unsigned int wj = 0; wj < KernelCols; wj++)
-          {
-            const auto w = weights[wi][wj];
-            const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj];
-            accs[0] = vmlal_s16(accs[0], vget_low_s16(w), vget_low_s16(x));
-            accs[1] = vmlal_s16(accs[1], vget_high_s16(w), vget_high_s16(x));
-          }
-        }
-
-        int32x4_t final_accs[2];
-        for (unsigned int i = 0; i < 2; i++)
-        {
-          const int32x4_t y = rounding_divide_by_exp2(
-              saturating_doubling_high_mul(accs[i], multipliers[i]),
-              shifts[i]);
-          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(output_offset));
-          final_accs[i] = vaddq_s32(y, offset);
-          final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min));
-          final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max));
-        }
-
-        const auto elems_s16 = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]),
-                                         vreinterpretq_s16_s32(final_accs[1]));
-        const int8x16_t elems = vreinterpretq_s8_s16(elems_s16.val[0]);
-        const uint8x8_t output =
-                    vget_low_u8(vreinterpretq_u8_s8(vuzpq_s8(elems, elems).val[0]));
-
-        vst1_u8(get_output_ptr(oi, oj, channel), output);
-      }
-    }
-  }
-
-  for (; n_channels; n_channels--, channel++)
-  {
-    // Load bias
-    const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
-    const int32_t multiplier = *reinterpret_cast<const int32_t *>(wbptr + sizeof(int32_t));
-    const int32_t shift = *reinterpret_cast<const int32_t *>(wbptr + 2*sizeof(int32_t));
-
-    wbptr += 3*sizeof(int32_t);
-
-    // Load weights
-    int16_t weights[KernelRows][KernelCols];
-    for (unsigned int i = 0; i < KernelRows; i++)
-    {
-      for (unsigned int j = 0; j < KernelCols; j++)
-      {
-        weights[i][j] = *(wbptr++);
-      }
-    }
-
-    // Load the input activations
-    int16_t inputs[InnerTileRows][InnerTileCols];
-    for (unsigned int i = 0; i < InnerTileRows; i++)
-    {
-      for (unsigned int j = 0; j < InnerTileCols; j++)
-      {
-        inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset;
-      }
-    }
-
-    // Perform the convolution
-    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
-    {
-      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
-      {
-        int32_t acc = bias;
-
-        for (unsigned int wi = 0; wi < KernelRows; wi++)
-        {
-          for (unsigned int wj = 0; wj < KernelCols; wj++)
-          {
-            const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
-            acc += w * x;
-          }
-        }
-
-        // Requantize
-        acc = rounding_divide_by_exp2(
-            saturating_doubling_high_mul(acc, multiplier),
-            -shift);
-        acc += output_offset;
-        acc = std::max(acc, clamp_min);
-        acc = std::min(acc, clamp_max);
-        uint8_t output = static_cast<uint8_t>(acc);
-        *(get_output_ptr(oi, oj, channel)) = output;
-      }
-    }
-  }
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols,
-  typename FInput, typename FOutput
->
-static inline void execute_tilefn_hybrid(
-  int n_channels,
-  const void* packed_params,
-  const ActivationFunction actfn,
-  const qasymm8::QAsymm8Params &input_quant,
-  const qasymm8::QAsymm8Params &output_quant,
-  FInput &get_input_ptr,
-  FOutput &get_output_ptr) {
-
-  // Compute min/max clamp values
-  int32_t clamp_min = std::numeric_limits<uint8_t>::min();
-  int32_t clamp_max = std::numeric_limits<uint8_t>::max();
-
-  if (actfn == ActivationFunction::ReLU) {
-    clamp_min = output_quant.offset;
-  }
-
-  // Disabling Relu6 for now
-  if (actfn == ActivationFunction::ReLU6) {
-    const int32_t top_rail = output_quant.quantize(6.0f);
-    clamp_max = std::min(clamp_max, top_rail);
-  }
-
-  // Call the tile execution method
-  tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
-         StrideCols>(n_channels, packed_params, get_input_ptr, get_output_ptr, clamp_min, clamp_max, input_quant.offset, output_quant.offset);
-}
-}
-
-
-
-namespace depthwise {
-using namespace qsymm8;
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-QSymm8HybridPerChannelDepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QSymm8HybridPerChannelDepthwiseConvolution(
-  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-  const ActivationFunction activation,
-  const QSymm8PerChannelParams& weight_quantisation,
-  const qasymm8::QAsymm8Params& input_quantisation,
-  const qasymm8::QAsymm8Params& output_quantisation,
-  unsigned int padding_top,
-  unsigned int padding_left,
-  unsigned int padding_bottom,
-  unsigned int padding_right
-) : QSymm8HybridPerChannelDepthwiseConvolution(
-    n_batches, n_input_rows, n_input_cols, n_channels,
-    activation, weight_quantisation, input_quantisation, output_quantisation,
-    QSymm8PerChannelRescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
-    padding_top, padding_left, padding_bottom, padding_right
-  )
-{
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-QSymm8HybridPerChannelDepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QSymm8HybridPerChannelDepthwiseConvolution(
-  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-  const ActivationFunction activation,
-  const QSymm8PerChannelParams& weight_quantisation,
-  const qasymm8::QAsymm8Params& input_quantisation,
-  const qasymm8::QAsymm8Params& output_quantisation,
-  const QSymm8PerChannelRescaleParams& rescale_params,
-  unsigned int padding_top,
-  unsigned int padding_left,
-  unsigned int padding_bottom,
-  unsigned int padding_right
-) : Base(
-      n_batches, n_input_rows, n_input_cols, n_channels, activation,
-      padding_top, padding_left, padding_bottom, padding_right
-  ),
-  _weights_quant(weight_quantisation),
-  _input_quant(input_quantisation),
-  _output_quant(output_quantisation),
-  _rescale_parameters(rescale_params)
-{
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-uint8_t QSymm8HybridPerChannelDepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::_input_padding_value(void) const
-{
-  return _input_quant.offset;
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-void QSymm8HybridPerChannelDepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::_pack_params(
-  void * const buffer,
-  const void * const weights,
-  const unsigned int weight_row_stride,
-  const unsigned int weight_col_stride,
-  const void * const biases
-) const
-{
-  const int8_t *wptr = static_cast<const int8_t *>(weights);
-  const int32_t *bptr = static_cast<const int32_t *>(biases);
-  const int32_t *mptr = static_cast<const int32_t *>(_rescale_parameters.multipliers.data());
-  const int32_t *sptr = static_cast<const int32_t *>(_rescale_parameters.shifts.data());
-  int8_t *outptr = static_cast<int8_t *>(buffer);
-
-  // We set the vector length to use doubles on both Aarch64 and Aarch32.  NOTE
-  // For SVE set this to half the vector length.
-  unsigned int veclen = 8;
-
-  // While there are channels left to process, pack a vector length of them at
-  // a time and reduce the size of vector used as the size of the tensor
-  // decreases.
-  for (
-    unsigned int n_channels = this->n_channels(); n_channels;
-    n_channels -= veclen,
-    outptr += veclen*(3*sizeof(int32_t) + this->kernel_rows*this->kernel_cols)
-  )
-  {
-    // NOTE Ignore this section if using SVE, the vector length remains the
-    // same and we just don't fill a full register for the tail.
-    while (n_channels < veclen)
-    {
-      // Reduce the vector length to either 8 or 1 (scalar)
-      // TODO Support more vector lengths in `execute_tile`.
-      veclen = (veclen == 16) ? 8 : 1;
-    }
-
-    // Get pointers to bias and weight portions of the output structure.
-    int32_t *out_bptr = reinterpret_cast<int32_t *>(outptr);
-    int32_t *out_mptr = reinterpret_cast<int32_t *>(outptr + veclen*sizeof(int32_t));
-    int32_t *out_sptr = reinterpret_cast<int32_t *>(outptr + 2*veclen*sizeof(int32_t));
-    int8_t  *out_wptr = outptr + 3*veclen*sizeof(int32_t);
-
-    // Copy a vector length of elements
-    for (unsigned int n = 0; n < veclen && n < n_channels; n++)
-    {
-      const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0;
-      const int32_t multiplier = (mptr != nullptr) ? *(mptr++) : 0;
-      const int32_t shift = (sptr != nullptr) ? *(sptr++) : 0;
-
-      out_bptr[n] = bias;
-      out_mptr[n] = multiplier;
-      out_sptr[n] = -shift;
-
-      for (unsigned int i = 0; i < KernelRows; i++)
-      {
-        int8_t *row_outptr = out_wptr + i*KernelCols*veclen;
-        for (unsigned int j = 0; j < KernelCols; j++)
-        {
-          int8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride);
-          row_outptr[j*veclen + n] = w;
-        }
-      }
-      wptr++;
-    }
-  }
-}
-
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-template <ActivationFunction Activation>
-void QSymm8HybridPerChannelDepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::execute_tile(
-  int n_channels,
-  const void* packed_params,
-  const uint8_t* inptr,
-  unsigned int in_row_stride,
-  unsigned int in_col_stride,
-  uint8_t* outptr,
-  unsigned int out_row_stride,
-  unsigned int out_col_stride
-) {
-
-  // Construct methods to get pointers
-  const auto get_input_ptr = [inptr, in_row_stride, in_col_stride](
-      const int i, const int j, const int channel) {
-    return inptr + i * in_row_stride + j * in_col_stride + channel;
-  };
-
-  const auto get_output_ptr = [outptr, out_row_stride, out_col_stride](
-      const int i, const int j, const int channel) {
-    return outptr + i * out_row_stride + j * out_col_stride + channel;
-  };
-
-  execute_tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
-                 StrideRows, StrideCols>(
-      n_channels, packed_params, Activation, _input_quant, _output_quant, get_input_ptr, get_output_ptr);
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-template <ActivationFunction Activation>
-void QSymm8HybridPerChannelDepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::execute_tile(
-  int n_channels,
-  const void* packed_params,
-  const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
-  uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
-) {
-  // Construct methods to get pointers
-  const auto get_input_ptr = [inptrs](const int i, const int j,
-                                      const int channel) {
-    return inptrs[i][j] + channel;
-  };
-
-  const auto get_output_ptr = [outptrs](const int i, const int j,
-                                        const int channel) {
-    return outptrs[i][j] + channel;
-  };
-
-  // Call the tile execution method
-  execute_tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
-                 StrideRows, StrideCols>(
-      n_channels, packed_params, Activation,  _input_quant, _output_quant, get_input_ptr, get_output_ptr);
-}
-
-} // namespace depthwise