1 files changed, 274 insertions, 354 deletions
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp
index b33f2768ad..674fc4d2df 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp
@@ -31,101 +31,73 @@
  */
 
 #include <algorithm>
+#include <cstdint>
 #include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/padding.hpp"
 #include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp"
 
 #pragma once
 
-namespace depthwise
-{
+#define MEMBERFN(TOUT) template <\
+  unsigned int OutputTileRows, unsigned int OutputTileColumns,\
+  unsigned int KernelRows, unsigned int KernelColumns,\
+  unsigned int StrideRows, unsigned int StrideColumns,\
+  typename TIn, typename TBias, typename TOut,\
+  typename Derived\
+> TOUT DepthwiseConvolutionBase<\
+  OutputTileRows, OutputTileColumns,\
+  KernelRows, KernelColumns,\
+  StrideRows, StrideColumns,\
+  TIn, TBias, TOut, Derived\
+>  // Expands to the template header, the return type TOUT and the DepthwiseConvolutionBase<...> qualifier; users append ::name(args)
 
-const unsigned int CHANNEL_BLOCK = 16;
+using namespace neon_convolution_kernels;
 
-namespace
+namespace depthwise
 {
-  inline int pad_along_dim(
-    const bool padding_same,
-    const int kernel_dim,
-    const int stride_dim,
-    const int input_dim
-  )
-  {
-    if (!padding_same)
-      return 0;
-    if (input_dim % stride_dim)
-      return std::max(kernel_dim - (input_dim % stride_dim), 0);
-    else
-      return std::max(kernel_dim - stride_dim, 0);
-  }
-}  // namespace
 
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_output_size(
-  const int dim_size, const bool same_padding
-)
+template <unsigned int KernelRows, unsigned int KernelColumns, size_t WeightSize, size_t BiasSize>
+struct PackParameters
 {
-  return iceildiv(dim_size - (same_padding ? 0 : (KC - 1)), SR);
-}
+  static void execute(
+    unsigned int n_channels,
+    void *buffer,
+    const void *weights,
+    unsigned int weight_row_stride,
+    unsigned int weight_col_stride,
+    const void *biases
+  );
+};
 
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_output_size(
+const unsigned int CHANNEL_BLOCK = 16;
+
+MEMBERFN(int)::get_output_size(
   const int dim_size, const unsigned int padding_before, const unsigned int padding_after
 )
 {
-  return iceildiv(dim_size + padding_before + padding_after - KR + 1, SR);
+  return iceildiv(dim_size + padding_before + padding_after - KernelRows + 1, StrideRows);
 }
 
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::DepthwiseConvolution(
-  const int n_batches, const int n_input_rows, const int n_input_cols,
-  const int n_channels, const bool padding_same,
-  const TIn* const weights,
-  const TIn* const input,
-  TOut* const output,
-  const int weight_col_stride,
-  const int weight_row_stride,
-  const int input_col_stride,
-  const int input_row_stride,
-  const int input_batch_stride,
-  const int output_col_stride,
-  const int output_row_stride,
-  const int output_batch_stride
-) : DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>(
-  n_batches, n_input_rows, n_input_cols,
-  n_channels,
-  pad_along_dim(padding_same, KR, SR, n_input_rows) / 2,  /* top padding */
-  pad_along_dim(padding_same, KC, SC, n_input_cols) / 2,  /* left padding */
-  iceildiv(pad_along_dim(padding_same, KR, SR, n_input_rows), 2),  /* bottom padding */
-  iceildiv(pad_along_dim(padding_same, KC, SC, n_input_cols), 2),  /* right padding */
-  weights, input, output,
-  weight_col_stride, weight_row_stride,
-  input_col_stride, input_row_stride, input_batch_stride,
-  output_col_stride, output_row_stride, output_batch_stride
-)
+MEMBERFN(int)::output_size(
+  const int dim_size, const unsigned int padding_before, const unsigned int padding_after
+) const
 {
+  return get_output_size(dim_size, padding_before, padding_after);  // Const member wrapper: forwards its arguments unchanged to get_output_size()
 }
 
-
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::DepthwiseConvolution(
-  const int n_batches, const int n_input_rows, const int n_input_cols,
+MEMBERFN()::DepthwiseConvolutionBase(
+  const int n_batches,
+  const int n_input_rows,
+  const int n_input_cols,
   const int n_channels,
+  ActivationFunction activation,
   const unsigned int padding_top,
   const unsigned int padding_left,
   const unsigned int padding_bottom,
-  const unsigned int padding_right,
-  const TIn* const weights,
-  const TIn* const input,
-  TOut* const output,
-  const int weight_col_stride,
-  const int weight_row_stride,
-  const int input_col_stride,
-  const int input_row_stride,
-  const int input_batch_stride,
-  const int output_col_stride,
-  const int output_row_stride,
-  const int output_batch_stride
-) : _weights(weights), _input(input), _output(output),
+  const unsigned int padding_right
+) : _input(nullptr), _output(nullptr),
+    _packed_parameters(nullptr),
+    _working_space(nullptr),
     _n_batches(n_batches),
     _n_input_rows(n_input_rows),
     _n_input_cols(n_input_cols),
@@ -138,37 +110,157 @@ DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::DepthwiseConvolution(
     _padding_left(padding_left),
     _padding_bottom(padding_bottom),
     _padding_right(padding_right),
-    _weight_col_stride(weight_col_stride ? weight_col_stride : _n_channels),
-    _weight_row_stride(weight_row_stride ? weight_row_stride : KC * _weight_col_stride),
-    _input_col_stride(input_col_stride ? input_col_stride : _n_channels),
-    _input_row_stride(input_row_stride ? input_row_stride : _n_input_cols * _input_col_stride),
-    _input_batch_stride(input_batch_stride ? input_batch_stride : _n_input_rows * _input_row_stride),
-    _output_col_stride(output_col_stride ? output_col_stride : _n_channels),
-    _output_row_stride(output_row_stride ? output_row_stride : _n_output_cols * _output_col_stride),
-    _output_batch_stride(output_batch_stride ? output_batch_stride : _n_output_rows * _output_row_stride),
-    _input_offset(0), _weights_offset(0)
+    _activation(activation),
+    _input_col_stride(0), _input_row_stride(0), _input_batch_stride(0),
+    _input_ws_col_stride(_n_channels),
+    _input_ws_row_stride(_input_ws_col_stride * inner_tile_cols),
+    _output_col_stride(0), _output_row_stride(0), _output_batch_stride(0),
+    _output_ws_col_stride(_n_channels),
+    _output_ws_row_stride(_output_ws_col_stride * OutputTileColumns)
 {
 }
 
+MEMBERFN(void)::set_input(const void* const inptr)
+{
+  set_input(inptr, _n_channels);  // Column stride defaults to the channel count
+}
 
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-unsigned int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_window() const
+MEMBERFN(void)::set_input(const void* const inptr, const int ld_col)
 {
-  // Parallelise over blocks of channels.
-  return iceildiv(_n_channels, CHANNEL_BLOCK);
+  set_input(inptr, _n_input_cols * ld_col, ld_col);
+}
+
+MEMBERFN(void)::set_input(const void* const inptr, const int ld_row, const int ld_col)
+{
+  set_input(inptr, _n_input_rows * ld_row, ld_row, ld_col);  // Batch stride defaults to _n_input_rows * ld_row
+}
+
+MEMBERFN(void)::set_input(const void* const inptr, const int ld_batch, const int ld_row, const int ld_col)
+{
+  _input_col_stride = ld_col;  // Record the strides exactly as given, innermost first
+  _input_row_stride = ld_row;
+  _input_batch_stride = ld_batch;
+  _input = static_cast<const TIn *>(inptr);  // The untyped pointer is viewed as TIn elements
+}
+
+MEMBERFN(void)::set_output(void* const outptr)
+{
+  set_output(outptr, _n_channels);  // Column stride defaults to the channel count
+}
+
+MEMBERFN(void)::set_output(void* const outptr, const int ld_col)
+{
+  set_output(outptr, _n_output_cols * ld_col, ld_col);  // Row stride defaults to _n_output_cols * ld_col
+}
+
+MEMBERFN(void)::set_output(void* const outptr, const int ld_row, const int ld_col)
+{
+  set_output(outptr, _n_output_rows * ld_row, ld_row, ld_col);  // Batch stride defaults to _n_output_rows * ld_row
+}
+
+MEMBERFN(void)::set_output(void* const outptr, const int ld_batch, const int ld_row, const int ld_col)
+{
+  _output_col_stride = ld_col;  // Record the strides exactly as given, innermost first
+  _output_row_stride = ld_row;
+  _output_batch_stride = ld_batch;
+  _output = static_cast<TOut *>(outptr);  // The untyped pointer is viewed as TOut elements
+}
+
+MEMBERFN(size_t)::get_packed_params_size(void) const
+{
+  return _n_channels * (sizeof(TIn)*KernelRows*KernelColumns + sizeof(TBias));  // Per channel: KernelRows*KernelColumns elements of TIn plus one TBias
+}
+
+MEMBERFN(void)::set_packed_params_buffer(void *buffer)
+{
+  _packed_parameters = buffer;  // Only the pointer is stored; nothing is copied or allocated here
+}
+
+MEMBERFN(void)::pack_params(const void *weights, const void *biases) const
+{
+  static_cast<const Derived *>(this)->pack_params(_packed_parameters, weights, biases);  // Packs into the buffer held in _packed_parameters, dispatching through Derived
+}
+
+MEMBERFN(void)::pack_params(void *buffer, const void *weights, const void *biases) const
+{
+  // Default strides: one column step is _n_channels elements, one row step is KernelColumns columns.
+  const unsigned int col_stride = _n_channels;
+  const unsigned int row_stride = KernelColumns * col_stride;
+  const auto * const derived = static_cast<const Derived *>(this);
+  derived->pack_params(buffer, weights, row_stride, col_stride, biases);
+}
+
+MEMBERFN(void)::pack_params(
+  void * const buffer,
+  const void * const weights,
+  const unsigned int weight_row_stride,
+  const unsigned int weight_col_stride,
+  const void * const biases
+) const
+{
+  // Hand the fully specified request over to the Derived class's _pack_params
+  const auto * const derived = static_cast<const Derived *>(this);
+  derived->_pack_params(buffer, weights, weight_row_stride, weight_col_stride, biases);
+}
+
+MEMBERFN(void)::_pack_params(
+  void * const buffer,  // Destination handed to PackParameters::execute
+  const void * const weights,
+  const unsigned int weight_row_stride,
+  const unsigned int weight_col_stride,
+  const void * const biases
+) const
+{
+  // Default implementation; the bias width must be sizeof(TBias) to agree with get_packed_params_size()
+  PackParameters<KernelRows, KernelColumns, sizeof(TIn), sizeof(TBias)>::execute(
+    _n_channels, buffer, weights, weight_row_stride, weight_col_stride, biases
+  );
+}
+
+MEMBERFN(size_t)::get_working_space_size(const unsigned int nthreads) const
+{
+  // One input working buffer plus one output working buffer for each thread
+  const size_t per_thread = _get_input_working_space_size() + _get_output_working_space_size();
+  return nthreads * per_thread;
 }
 
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::set_offsets(int input_offset, int weights_offset)
+MEMBERFN(void)::set_working_space(void *buffer)
 {
-    _input_offset = input_offset;
-    _weights_offset = weights_offset;
+  _working_space = buffer;
 }
 
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::run(
+MEMBERFN(size_t)::_get_input_working_space_size(void) const
+{
+  return sizeof(TIn) * inner_tile_rows * inner_tile_cols * _n_channels;  // Bytes for an inner_tile_rows x inner_tile_cols tile of TIn over all channels
+}
+
+MEMBERFN(size_t)::_get_output_working_space_size(void) const
+{
+  return sizeof(TOut) * OutputTileRows * OutputTileColumns * _n_channels;  // Bytes for an OutputTileRows x OutputTileColumns tile of TOut over all channels
+}
+
+MEMBERFN(void *)::_get_input_working_space(const unsigned int threadid) const
+{
+  // Thread t's region starts t * (input size + output size) bytes into _working_space
+  const size_t per_thread = _get_input_working_space_size() + _get_output_working_space_size();
+  return static_cast<uint8_t*>(_working_space) + threadid * per_thread;
+}
+
+MEMBERFN(void *)::_get_output_working_space(const unsigned int threadid) const
+{
+  return static_cast<uint8_t*>(_get_input_working_space(threadid)) + _get_input_working_space_size();  // Output buffer directly follows this thread's input buffer
+}
+
+MEMBERFN(unsigned int)::get_window() const
+{
+  // Parallelise over blocks of channels: one work unit per CHANNEL_BLOCK channels (iceildiv presumably rounds up).
+  return iceildiv(_n_channels, CHANNEL_BLOCK);
+}
+
+MEMBERFN(void)::run(
   const unsigned int start,
-  const unsigned int stop
+  const unsigned int stop,
+  const unsigned int threadid
 )
 {
   // Parallelise over blocks of channels
@@ -205,43 +297,38 @@ void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::run(
       const int output_row_bottom = (tile_i + 1)*output_tile_rows;
       const int output_row_pad_bottom = std::max(0, output_row_bottom - _n_output_rows);
 
+      // Get the offset into the packed parameters
+      const auto params_ptr = static_cast<const uint8_t*>(_packed_parameters) +
+        start_channel*(sizeof(TIn)*KernelRows*KernelColumns + sizeof(TBias));
+
       // Process the row
       process_tile_row(
+        threadid,
         stop_channel - start_channel,
-        _weights + start_channel, _weight_row_stride, _weight_col_stride,
-        inptr_row + start_channel, _input_row_stride, _input_col_stride,
-        outptr_row + start_channel, _output_row_stride, _output_col_stride,
+        params_ptr,
+        inptr_row + start_channel,
+        outptr_row + start_channel,
         input_row_pad_top, input_pad_left, input_row_pad_bottom,
         output_row_pad_bottom,
-        _n_tile_cols, _n_input_cols, _n_output_cols,
-        _input_offset, _weights_offset
+        _n_tile_cols, _n_input_cols, _n_output_cols
       );
     }
   }
 }
 
-
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile_row(
+MEMBERFN(void)::process_tile_row(
+  const unsigned int threadid,
   const int n_channels,
-  const TIn* const weights,
-  const int weight_row_stride,
-  const int weight_col_stride,
+  const void* const packed_params,
   const TIn* const inptr,
-  const int in_row_stride,
-  const int in_col_stride,
   TOut* const outptr,
-  const int out_row_stride,
-  const int out_col_stride,
   const int row_pad_in_top,
   const int row_pad_in_left,
   const int row_pad_in_bottom,
   const int row_pad_out_bottom,
   const int n_tiles,
   const int n_input_cols,
-  const int n_output_cols,
-  const int input_offset,
-  const int weights_offset
+  const int n_output_cols
 )
 {
   constexpr int tile_overlap = kernel_cols - stride_cols;
@@ -261,264 +348,97 @@ void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile_row
 
     // Get pointers into the inputs and outputs
     const int col_offset = (tile_j == 0) ? 0 : row_pad_in_left;
-    const TIn* const inptr_col = (inptr + ((inner_tile_cols - tile_overlap)*tile_j - col_offset)*in_col_stride);
-    TOut* const outptr_col = outptr + tile_j * output_tile_cols * out_col_stride;
-
-    // Apply the specific tile processing function
-    const bool pad_top = row_pad_in_top > 0;
-    const bool pad_left = t_pad_in_left > 0;
-    const bool pad_bottom = row_pad_in_bottom || row_pad_out_bottom;
-    const bool pad_right = t_pad_in_right || t_pad_out_right;
-
-    const TileFn tilefn = [&] () {
-      if (!pad_top && !pad_left && !pad_bottom && !pad_right)
-      {
-        // No padding
-        return tilefn_unpadded;
-      }
-      else if (pad_top && !pad_left && !pad_bottom && !pad_right)
-      {
-        // Padding on the top only, subtract off the minimum expected padding in
-        // order to index into the array of specialised methods.
-        const int index = row_pad_in_top - min_in_pad_top;
-        return tilefn_top[index];
-      }
-      else if (!pad_top && pad_left && !pad_bottom && !pad_right)
-      {
-        // Padding on the left only, subtract off the minimum expected padding in
-        // order to index into the array of specialised methods.
-        const int index = t_pad_in_left - min_in_pad_left;
-        return tilefn_left[index];
-      }
-      else if (!pad_top && !pad_left && pad_bottom && !pad_right)
-      {
-        // Padding on the bottom only
-        return tilefn_bottom[row_pad_in_bottom][row_pad_out_bottom];
-      }
-      else if (!pad_top && !pad_left && !pad_bottom && pad_right)
-      {
-        // Padding on the right only
-        return tilefn_right[t_pad_in_right][t_pad_out_right];
-      }
-      else
-      {
-        // Otherwise use generic tile processing method.
-        return tilefn_generic;
-      }
-    }();
-
-    tilefn(
-      n_channels,
-      weights, weight_row_stride, weight_col_stride,
-      inptr_col, in_row_stride, in_col_stride,
-      outptr_col, out_row_stride, out_col_stride,
-      row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right,
-      row_pad_out_bottom, t_pad_out_right, input_offset, weights_offset
+    const TIn* const inptr_col = (inptr + ((inner_tile_cols - tile_overlap)*tile_j - col_offset)*_input_col_stride);
+    TOut* const outptr_col = outptr + tile_j * output_tile_cols * _output_col_stride;
+
+    // Process just this tile
+    process_tile(
+      threadid, n_channels, packed_params, inptr_col, outptr_col,
+      row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right,  // Input paddings
+      row_pad_out_bottom, t_pad_out_right  // Output paddings
     );
   }
 }
 
-
-// New templated struct used solely as a way to provide tile processing
-// specialisations.
-template <int OutputTileRows, int OutputTileCols,
-          int KernelRows, int KernelCols,
-          int StrideRows, int StrideCols,
-          typename TIn, typename TOut>
-struct DepthwiseConvolutionImpl : public DepthwiseConvolution<
-    OutputTileRows, OutputTileCols,
-    KernelRows, KernelCols,
-    StrideRows, StrideCols, TIn, TOut
->
+MEMBERFN(TIn)::_input_padding_value(void) const
 {
-  typedef DepthwiseConvolution<
-    OutputTileRows, OutputTileCols,
-    KernelRows, KernelCols,
-    StrideRows, StrideCols,
-    TIn, TOut
-  > DWC;
-
-  /** Perform the depthwise convolution of a tile.
-   *
-   * @param[in] n_channels Number of channels.
-   * @param[in] weights Pointer to Height x Width x Channels ordered weights.
-   * @param[in] inptr Pointer to the top-left unpadded value of the tile.
-   * @param[in] in_row_stride Stride between rows of the input tensor.
-   * @param[in] in_col_stride Stride between columns of the input tensor.
-   * @param[out] outptr Pointer to the top-left output value for the tile.
-   * @param[in] out_row_stride Stride between rows of the output tensor.
-   * @param[in] out_col_stride Stride between columns of the output tensor.
-   *
-   * The following parameters may be ignored if the function has been
-   * specialised for specific padding constraints.
-   *
-   * @param[in] _in_pad_top Padding to apply to top of input tile.
-   * @param[in] _in_pad_left Padding to apply to left of input tile.
-   * @param[in] _in_pad_bottom Padding to apply to bottom of input tile.
-   * @param[in] _in_pad_right Padding to apply to right of input tile.
-   * @param[in] _out_pad_bottom Null cells at bottom of output tile.
-   * @param[in] _out_pad_right Null cells at right of output tile.
-   */
-  template <
-    bool Specialize=false,  // Specialize (or not) the method
-    int InPadTop=0,         // If specialized, top padding
-    int InPadLeft=0,        // If specialized, left padding
-    int InPadBottom=0,      // If specialized, bottom padding
-    int InPadRight=0,       // If specialized, right padding
-    int OutPadBottom=0,     // If specialized, bottom output padding
-    int OutPadRight=0       // If specialized, bottom right padding
-  >
-  static void process_tile(
-    const int n_channels,
-    const TIn* const weights,
-    const int weight_row_stride,
-    const int weight_col_stride,
-    const TIn* const inptr,
-    const int in_row_stride,
-    const int in_col_stride,
-    TOut* const outptr,
-    const int out_row_stride,
-    const int out_col_stride,
-    const int in_pad_top=0,
-    const int in_pad_left=0,
-    const int in_pad_bottom=0,
-    const int in_pad_right=0,
-    const int out_pad_bottom=0,
-    const int out_pad_right=0,
-    const int input_offset=0,
-    const int weights_offset=0
-  );
-};
-
+  return static_cast<TIn>(0);
+}
 
-template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
-template <
-  bool Specialize,
-  int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
-  int OutPadBottom, int OutPadRight
->
-void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile(
+MEMBERFN(void)::process_tile(
+  const unsigned int threadid,
   const int n_channels,
-  const TIn *__restrict__ const weights,
-  const int weight_row_stride,
-  const int weight_col_stride,
-  const TIn *__restrict__ const inptr,
-  const int in_row_stride,
-  const int in_col_stride,
-  TOut *__restrict__ const outptr,
-  const int out_row_stride,
-  const int out_col_stride,
-  const int _in_pad_top,
-  const int _in_pad_left,
-  const int _in_pad_bottom,
-  const int _in_pad_right,
-  const int _out_pad_bottom,
-  const int _out_pad_right,
-  const int _input_offset,
-  const int _weights_offset
+  const void* const packed_params,
+  const TIn* const inptr,
+  TOut* const outptr,
+  const int pad_in_top,
+  const int pad_in_left,
+  const int pad_in_bottom,
+  const int pad_in_right,
+  const int pad_out_bottom,
+  const int pad_out_right
 )
 {
-  constexpr auto inner_tile_rows = DWC::inner_tile_rows;
-  constexpr auto inner_tile_cols = DWC::inner_tile_cols;
-  constexpr auto kernel_rows = DWC::kernel_rows;
-  constexpr auto kernel_cols = DWC::kernel_cols;
-  constexpr auto output_tile_rows = DWC::output_tile_rows;
-  constexpr auto output_tile_cols = DWC::output_tile_cols;
-  constexpr auto stride_rows = DWC::stride_rows;
-  constexpr auto stride_cols = DWC::stride_cols;
-
-  // Extract parameters
-  const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
-  const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
-  const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
-  const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
-  const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
-  const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
-
-  // Compute valid ranges of the tile
-  const int in_cells_i = inner_tile_rows - in_pad_bottom;
-  const int in_cells_j = inner_tile_cols - in_pad_right;
-  const int out_cells_i = output_tile_rows - out_pad_bottom;
-  const int out_cells_j = output_tile_cols - out_pad_right;
-
-  // Instantiate pointers
-  const TIn* __restrict__ inptr_base = inptr;
-  const TIn* __restrict__ wptr_base = weights;
-  TOut* __restrict__ outptr_base = outptr;
-
-  // Perform the depthwise convolution
-  int channels_remaining = n_channels;
-  for (; channels_remaining; channels_remaining--)
+  const bool pad_input = pad_in_top || pad_in_left || pad_in_bottom || pad_in_right;
+  const bool pad_output = pad_out_bottom || pad_out_right;
+
+  if (pad_input)
   {
-    // Load input tile
-    TIn u[inner_tile_rows][inner_tile_cols];
-    for (int i = 0; i < inner_tile_rows; i++)
-    {
-      const TIn* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
-      for (int j = 0; j < inner_tile_cols; j++)
-      {
-        if (i < in_pad_top || in_cells_i <= i ||
-            j < in_pad_left || in_cells_j <= j)
-        {
-          u[i][j] = static_cast<TIn>(0);
-        }
-        else
-        {
-          u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride);
-        }
-      }
-    }
-    inptr_base++;
+    // Copy the input into the temporary buffer, applying padding
+    padding::copy_and_pad_tile<TIn>(
+      inner_tile_rows, inner_tile_cols, n_channels,
+      inptr, _input_row_stride, _input_col_stride,
+      static_cast<TIn *>(_get_input_working_space(threadid)), _input_ws_row_stride, _input_ws_col_stride,
+      pad_in_top, pad_in_left, pad_in_bottom, pad_in_right,
+      static_cast<Derived *>(this)->_input_padding_value()
+    );
+  }
 
-    // Load weights tile
-    TIn w[kernel_rows][kernel_cols];
-    for (int i = 0; i < kernel_rows; i++)
-    {
-      const TIn* const wptr_row = wptr_base + i*weight_row_stride;
-      for (int j = 0; j < kernel_cols; j++)
-      {
-        w[i][j] = *(wptr_row + j*weight_col_stride);
-      }
-    }
-    wptr_base++;
+  // Execute the kernel
+  const TIn * const tile_inptr = !pad_input ? inptr : static_cast<const TIn *>(_get_input_working_space(threadid));
+  const int in_row_stride = !pad_input ? _input_row_stride : _input_ws_row_stride;
+  const int in_col_stride = !pad_input ? _input_col_stride : _input_ws_col_stride;
 
-    // Perform the convolution
-    TOut v[output_tile_rows][output_tile_cols];
-    for (int out_i = 0; out_i < out_cells_i; out_i++)
-    {
-      for (int out_j = 0; out_j < out_cells_j; out_j++)
-      {
-        // Clear the accumulator
-        v[out_i][out_j] = static_cast<TOut>(0);
-
-        // Base co-ordinate
-        const int base_i = out_i * stride_rows;
-        const int base_j = out_j * stride_cols;
-
-        // Fill the accumulator
-        for (int in_i = 0; in_i < kernel_rows; in_i++)
-        {
-          const int i = base_i + in_i;
-          for (int in_j = 0; in_j < kernel_cols; in_j++)
-          {
-            const int j = base_j + in_j;
-            v[out_i][out_j] += w[in_i][in_j] * u[i][j];
-          }
-        }
-      }
-    }
+  TOut * const tile_outptr = !pad_output ? outptr : static_cast<TOut *>(_get_output_working_space(threadid));
+  const int out_row_stride = !pad_output ? _output_row_stride : _output_ws_row_stride;
+  const int out_col_stride = !pad_output ? _output_col_stride : _output_ws_col_stride;
 
-    // Store the output tile
-    for (int i = 0; i < out_cells_i; i++)
-    {
-      TOut* __restrict__ const outptr_row = outptr_base + i*out_row_stride;
-      for (int j = 0; j < out_cells_j; j++)
-      {
-        *(outptr_row + j*out_col_stride) = v[i][j];
-      }
-    }
-    outptr_base++;
+  Derived * dthis = static_cast<Derived *>(this);
+
+  switch(_activation)
+  {
+    case ActivationFunction::ReLU:
+      dthis->template execute_tile<ActivationFunction::ReLU>(
+        n_channels, packed_params, tile_inptr, in_row_stride, in_col_stride, tile_outptr, out_row_stride, out_col_stride
+      );
+      break;
+    case ActivationFunction::ReLU6:
+      dthis->template execute_tile<ActivationFunction::ReLU6>(
+        n_channels, packed_params, tile_inptr, in_row_stride, in_col_stride, tile_outptr, out_row_stride, out_col_stride
+      );
+      break;
+    default:
+      dthis->template execute_tile<ActivationFunction::None>(
+        n_channels, packed_params, tile_inptr, in_row_stride, in_col_stride, tile_outptr, out_row_stride, out_col_stride
+      );
+      break;
   }
+
+  if (pad_output)
+  {
+    // Copy the output from the temporary buffer, removing unnecessary values
+    padding::CopyCropped<OutputTileRows, OutputTileColumns>::execute(
+      n_channels * sizeof(TOut),
+      _get_output_working_space(threadid), _output_ws_row_stride * sizeof(TOut), _output_ws_col_stride * sizeof(TOut),
+      outptr, _output_row_stride * sizeof(TOut), _output_col_stride * sizeof(TOut),
+      0, 0, pad_out_bottom, pad_out_right
+    );
+  }
+}
+
+MEMBERFN(int)::n_channels(void) const
+{
+  return _n_channels;  // Read-only accessor for the _n_channels member
 }
 
 }  // namespace depthwise