diff options
Diffstat (limited to 'arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp')
-rw-r--r-- | arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp | 628 |
1 files changed, 274 insertions, 354 deletions
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp index b33f2768ad..674fc4d2df 100644 --- a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp +++ b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp @@ -31,101 +31,73 @@ */ #include <algorithm> +#include <cstdint> #include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp" +#include "arm_compute/core/NEON/kernels/convolution/common/padding.hpp" #include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp" #pragma once -namespace depthwise -{ +#define MEMBERFN(TOUT) template <\ + unsigned int OutputTileRows, unsigned int OutputTileColumns,\ + unsigned int KernelRows, unsigned int KernelColumns,\ + unsigned int StrideRows, unsigned int StrideColumns,\ + typename TIn, typename TBias, typename TOut,\ + typename Derived\ +> TOUT DepthwiseConvolutionBase<\ + OutputTileRows, OutputTileColumns,\ + KernelRows, KernelColumns,\ + StrideRows, StrideColumns,\ + TIn, TBias, TOut, Derived\ +> -const unsigned int CHANNEL_BLOCK = 16; +using namespace neon_convolution_kernels; -namespace +namespace depthwise { - inline int pad_along_dim( - const bool padding_same, - const int kernel_dim, - const int stride_dim, - const int input_dim - ) - { - if (!padding_same) - return 0; - if (input_dim % stride_dim) - return std::max(kernel_dim - (input_dim % stride_dim), 0); - else - return std::max(kernel_dim - stride_dim, 0); - } -} // namespace -template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut> -int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_output_size( - const int dim_size, const bool same_padding -) +template <unsigned int KernelRows, unsigned int KernelColumns, size_t WeightSize, size_t BiasSize> +struct PackParameters { - return iceildiv(dim_size - (same_padding ? 0 : (KC - 1)), SR); -} + static void execute( + unsigned int n_channels, + void *buffer, + const void *weights, + unsigned int weight_row_stride, + unsigned int weight_col_stride, + const void *biases + ); +}; -template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut> -int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_output_size( +const unsigned int CHANNEL_BLOCK = 16; + +MEMBERFN(int)::get_output_size( const int dim_size, const unsigned int padding_before, const unsigned int padding_after ) { - return iceildiv(dim_size + padding_before + padding_after - KR + 1, SR); + return iceildiv(dim_size + padding_before + padding_after - KernelRows + 1, StrideRows); } -template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut> -DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::DepthwiseConvolution( - const int n_batches, const int n_input_rows, const int n_input_cols, - const int n_channels, const bool padding_same, - const TIn* const weights, - const TIn* const input, - TOut* const output, - const int weight_col_stride, - const int weight_row_stride, - const int input_col_stride, - const int input_row_stride, - const int input_batch_stride, - const int output_col_stride, - const int output_row_stride, - const int output_batch_stride -) : DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>( - n_batches, n_input_rows, n_input_cols, - n_channels, - pad_along_dim(padding_same, KR, SR, n_input_rows) / 2, /* top padding */ - pad_along_dim(padding_same, KC, SC, n_input_cols) / 2, /* left padding */ - iceildiv(pad_along_dim(padding_same, KR, SR, n_input_rows), 2), /* bottom padding */ - iceildiv(pad_along_dim(padding_same, KC, SC, n_input_cols), 2), /* right padding */ - weights, input, output, - weight_col_stride, weight_row_stride, - input_col_stride, input_row_stride, input_batch_stride, - output_col_stride, output_row_stride, output_batch_stride -) +MEMBERFN(int)::output_size( + const int dim_size, const unsigned int padding_before, const unsigned int padding_after +) const { + return get_output_size(dim_size, padding_before, padding_after); } - -template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut> -DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::DepthwiseConvolution( - const int n_batches, const int n_input_rows, const int n_input_cols, +MEMBERFN()::DepthwiseConvolutionBase( + const int n_batches, + const int n_input_rows, + const int n_input_cols, const int n_channels, + ActivationFunction activation, const unsigned int padding_top, const unsigned int padding_left, const unsigned int padding_bottom, - const unsigned int padding_right, - const TIn* const weights, - const TIn* const input, - TOut* const output, - const int weight_col_stride, - const int weight_row_stride, - const int input_col_stride, - const int input_row_stride, - const int input_batch_stride, - const int output_col_stride, - const int output_row_stride, - const int output_batch_stride -) : _weights(weights), _input(input), _output(output), + const unsigned int padding_right +) : _input(nullptr), _output(nullptr), + _packed_parameters(nullptr), + _working_space(nullptr), _n_batches(n_batches), _n_input_rows(n_input_rows), _n_input_cols(n_input_cols), @@ -138,37 +110,157 @@ DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::DepthwiseConvolution( _padding_left(padding_left), _padding_bottom(padding_bottom), _padding_right(padding_right), - _weight_col_stride(weight_col_stride ? weight_col_stride : _n_channels), - _weight_row_stride(weight_row_stride ? weight_row_stride : KC * _weight_col_stride), - _input_col_stride(input_col_stride ? input_col_stride : _n_channels), - _input_row_stride(input_row_stride ? input_row_stride : _n_input_cols * _input_col_stride), - _input_batch_stride(input_batch_stride ? input_batch_stride : _n_input_rows * _input_row_stride), - _output_col_stride(output_col_stride ? output_col_stride : _n_channels), - _output_row_stride(output_row_stride ? output_row_stride : _n_output_cols * _output_col_stride), - _output_batch_stride(output_batch_stride ? output_batch_stride : _n_output_rows * _output_row_stride), - _input_offset(0), _weights_offset(0) + _activation(activation), + _input_col_stride(0), _input_row_stride(0), _input_batch_stride(0), + _input_ws_col_stride(_n_channels), + _input_ws_row_stride(_input_ws_col_stride * inner_tile_cols), + _output_col_stride(0), _output_row_stride(0), _output_batch_stride(0), + _output_ws_col_stride(_n_channels), + _output_ws_row_stride(_output_ws_col_stride * OutputTileColumns) { } +MEMBERFN(void)::set_input(const void* const inptr) +{ + set_input(inptr, _n_channels); +} -template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut> -unsigned int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_window() const +MEMBERFN(void)::set_input(const void* const inptr, const int ld_col) { - // Parallelise over blocks of channels. - return iceildiv(_n_channels, CHANNEL_BLOCK); + set_input(inptr, _n_input_cols * ld_col, ld_col); +} + +MEMBERFN(void)::set_input(const void* const inptr, const int ld_row, const int ld_col) +{ + set_input(inptr, _n_input_rows * ld_row, ld_row, ld_col); +} + +MEMBERFN(void)::set_input(const void* const inptr, const int ld_batch, const int ld_row, const int ld_col) +{ + _input = static_cast<const TIn *>(inptr); + _input_batch_stride = ld_batch; + _input_row_stride = ld_row; + _input_col_stride = ld_col; +} + +MEMBERFN(void)::set_output(void* const outptr) +{ + set_output(outptr, _n_channels); +} + +MEMBERFN(void)::set_output(void* const outptr, const int ld_col) +{ + set_output(outptr, _n_output_cols * ld_col, ld_col); +} + +MEMBERFN(void)::set_output(void* const outptr, const int ld_row, const int ld_col) +{ + set_output(outptr, _n_output_rows * ld_row, ld_row, ld_col); +} + +MEMBERFN(void)::set_output(void* const outptr, const int ld_batch, const int ld_row, const int ld_col) +{ + _output = static_cast<TOut *>(outptr); + _output_batch_stride = ld_batch; + _output_row_stride = ld_row; + _output_col_stride = ld_col; +} + +MEMBERFN(size_t)::get_packed_params_size(void) const +{ + return _n_channels * (sizeof(TIn)*KernelRows*KernelColumns + sizeof(TBias)); +} + +MEMBERFN(void)::set_packed_params_buffer(void *buffer) +{ + _packed_parameters = buffer; +} + +MEMBERFN(void)::pack_params(const void *weights, const void *biases) const +{ + static_cast<const Derived *>(this)->pack_params(_packed_parameters, weights, biases); +} + +MEMBERFN(void)::pack_params(void *buffer, const void *weights, const void *biases) const +{ + const unsigned int weight_col_stride = _n_channels; + const unsigned int weight_row_stride = KernelColumns * weight_col_stride; + static_cast<const Derived *>(this)->pack_params( + buffer, weights, weight_row_stride, weight_col_stride, biases + ); +} + +MEMBERFN(void)::pack_params( + void * const buffer, + const void * const weights, + const unsigned int weight_row_stride, + const unsigned int weight_col_stride, + const void * const biases +) const +{ + static_cast<const Derived *>(this)->_pack_params( + buffer, weights, weight_row_stride, weight_col_stride, biases + ); +} + +MEMBERFN(void)::_pack_params( + void * const buffer, + const void * const weights, + const unsigned int weight_row_stride, + const unsigned int weight_col_stride, + const void * const biases +) const +{ + // Default implementation + PackParameters<KernelRows, KernelColumns, sizeof(TIn), sizeof(TOut)>::execute( + _n_channels, buffer, weights, weight_row_stride, weight_col_stride, biases + ); +} + +MEMBERFN(size_t)::get_working_space_size(const unsigned int nthreads) const +{ + return nthreads * ( + _get_input_working_space_size() + _get_output_working_space_size() + ); } -template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut> -void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::set_offsets(int input_offset, int weights_offset) +MEMBERFN(void)::set_working_space(void *buffer) { - _input_offset = input_offset; - _weights_offset = weights_offset; + _working_space = buffer; } -template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut> -void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::run( +MEMBERFN(size_t)::_get_input_working_space_size(void) const +{ + return sizeof(TIn) * inner_tile_rows * inner_tile_cols * _n_channels; +} + +MEMBERFN(size_t)::_get_output_working_space_size(void) const +{ + return sizeof(TOut) * OutputTileRows * OutputTileColumns * _n_channels; +} + +MEMBERFN(void *)::_get_input_working_space(const unsigned int threadid) const +{ + return static_cast<uint8_t*>(_working_space) + threadid * ( + _get_input_working_space_size() + _get_output_working_space_size() + ); +} + +MEMBERFN(void *)::_get_output_working_space(const unsigned int threadid) const +{ + return static_cast<uint8_t*>(_get_input_working_space(threadid)) + _get_input_working_space_size(); +} + +MEMBERFN(unsigned int)::get_window() const +{ + // Parallelise over blocks of channels. + return iceildiv(_n_channels, CHANNEL_BLOCK); +} + +MEMBERFN(void)::run( const unsigned int start, - const unsigned int stop + const unsigned int stop, + const unsigned int threadid ) { // Parallelise over blocks of channels @@ -205,43 +297,38 @@ void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::run( const int output_row_bottom = (tile_i + 1)*output_tile_rows; const int output_row_pad_bottom = std::max(0, output_row_bottom - _n_output_rows); + // Get the offset into the packed parameters + const auto params_ptr = static_cast<const uint8_t*>(_packed_parameters) + + start_channel*(sizeof(TIn)*KernelRows*KernelColumns + sizeof(TBias)); + // Process the row process_tile_row( + threadid, stop_channel - start_channel, - _weights + start_channel, _weight_row_stride, _weight_col_stride, - inptr_row + start_channel, _input_row_stride, _input_col_stride, - outptr_row + start_channel, _output_row_stride, _output_col_stride, + params_ptr, + inptr_row + start_channel, + outptr_row + start_channel, input_row_pad_top, input_pad_left, input_row_pad_bottom, output_row_pad_bottom, - _n_tile_cols, _n_input_cols, _n_output_cols, - _input_offset, _weights_offset + _n_tile_cols, _n_input_cols, _n_output_cols ); } } } - -template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut> -void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile_row( +MEMBERFN(void)::process_tile_row( + const unsigned int threadid, const int n_channels, - const TIn* const weights, - const int weight_row_stride, - const int weight_col_stride, + const void* const packed_params, const TIn* const inptr, - const int in_row_stride, - const int in_col_stride, TOut* const outptr, - const int out_row_stride, - const int out_col_stride, const int row_pad_in_top, const int row_pad_in_left, const int row_pad_in_bottom, const int row_pad_out_bottom, const int n_tiles, const int n_input_cols, - const int n_output_cols, - const int input_offset, - const int weights_offset + const int n_output_cols ) { constexpr int tile_overlap = kernel_cols - stride_cols; @@ -261,264 +348,97 @@ void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile_row // Get pointers into the inputs and outputs const int col_offset = (tile_j == 0) ? 0 : row_pad_in_left; - const TIn* const inptr_col = (inptr + ((inner_tile_cols - tile_overlap)*tile_j - col_offset)*in_col_stride); - TOut* const outptr_col = outptr + tile_j * output_tile_cols * out_col_stride; - - // Apply the specific tile processing function - const bool pad_top = row_pad_in_top > 0; - const bool pad_left = t_pad_in_left > 0; - const bool pad_bottom = row_pad_in_bottom || row_pad_out_bottom; - const bool pad_right = t_pad_in_right || t_pad_out_right; - - const TileFn tilefn = [&] () { - if (!pad_top && !pad_left && !pad_bottom && !pad_right) - { - // No padding - return tilefn_unpadded; - } - else if (pad_top && !pad_left && !pad_bottom && !pad_right) - { - // Padding on the top only, subtract off the minimum expected padding in - // order to index into the array of specialised methods. - const int index = row_pad_in_top - min_in_pad_top; - return tilefn_top[index]; - } - else if (!pad_top && pad_left && !pad_bottom && !pad_right) - { - // Padding on the left only, subtract off the minimum expected padding in - // order to index into the array of specialised methods. - const int index = t_pad_in_left - min_in_pad_left; - return tilefn_left[index]; - } - else if (!pad_top && !pad_left && pad_bottom && !pad_right) - { - // Padding on the bottom only - return tilefn_bottom[row_pad_in_bottom][row_pad_out_bottom]; - } - else if (!pad_top && !pad_left && !pad_bottom && pad_right) - { - // Padding on the right only - return tilefn_right[t_pad_in_right][t_pad_out_right]; - } - else - { - // Otherwise use generic tile processing method. - return tilefn_generic; - } - }(); - - tilefn( - n_channels, - weights, weight_row_stride, weight_col_stride, - inptr_col, in_row_stride, in_col_stride, - outptr_col, out_row_stride, out_col_stride, - row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right, - row_pad_out_bottom, t_pad_out_right, input_offset, weights_offset + const TIn* const inptr_col = (inptr + ((inner_tile_cols - tile_overlap)*tile_j - col_offset)*_input_col_stride); + TOut* const outptr_col = outptr + tile_j * output_tile_cols * _output_col_stride; + + // Process just this tile + process_tile( + threadid, n_channels, packed_params, inptr_col, outptr_col, + row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right, // Input paddings + row_pad_out_bottom, t_pad_out_right // Output paddings ); } } - -// New templated struct used solely as a way to provide tile processing -// specialisations. -template <int OutputTileRows, int OutputTileCols, - int KernelRows, int KernelCols, - int StrideRows, int StrideCols, - typename TIn, typename TOut> -struct DepthwiseConvolutionImpl : public DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, TIn, TOut -> +MEMBERFN(TIn)::_input_padding_value(void) const { - typedef DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - TIn, TOut - > DWC; - - /** Perform the depthwise convolution of a tile. - * - * @param[in] n_channels Number of channels. - * @param[in] weights Pointer to Height x Width x Channels ordered weights. - * @param[in] inptr Pointer to the top-left unpadded value of the tile. - * @param[in] in_row_stride Stride between rows of the input tensor. - * @param[in] in_col_stride Stride between columns of the input tensor. - * @param[out] outptr Pointer to the top-left output value for the tile. - * @param[in] out_row_stride Stride between rows of the output tensor. - * @param[in] out_col_stride Stride between columns of the output tensor. - * - * The following parameters may be ignored if the function has been - * specialised for specific padding constraints. - * - * @param[in] _in_pad_top Padding to apply to top of input tile. - * @param[in] _in_pad_left Padding to apply to left of input tile. - * @param[in] _in_pad_bottom Padding to apply to bottom of input tile. - * @param[in] _in_pad_right Padding to apply to right of input tile. - * @param[in] _out_pad_bottom Null cells at bottom of output tile. - * @param[in] _out_pad_right Null cells at right of output tile. - */ - template < - bool Specialize=false, // Specialize (or not) the method - int InPadTop=0, // If specialized, top padding - int InPadLeft=0, // If specialized, left padding - int InPadBottom=0, // If specialized, bottom padding - int InPadRight=0, // If specialized, right padding - int OutPadBottom=0, // If specialized, bottom output padding - int OutPadRight=0 // If specialized, bottom right padding - > - static void process_tile( - const int n_channels, - const TIn* const weights, - const int weight_row_stride, - const int weight_col_stride, - const TIn* const inptr, - const int in_row_stride, - const int in_col_stride, - TOut* const outptr, - const int out_row_stride, - const int out_col_stride, - const int in_pad_top=0, - const int in_pad_left=0, - const int in_pad_bottom=0, - const int in_pad_right=0, - const int out_pad_bottom=0, - const int out_pad_right=0, - const int input_offset=0, - const int weights_offset=0 - ); -}; - + return static_cast<TIn>(0); +} -template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut> -template < - bool Specialize, - int InPadTop, int InPadLeft, int InPadBottom, int InPadRight, - int OutPadBottom, int OutPadRight -> -void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile( +MEMBERFN(void)::process_tile( + const unsigned int threadid, const int n_channels, - const TIn *__restrict__ const weights, - const int weight_row_stride, - const int weight_col_stride, - const TIn *__restrict__ const inptr, - const int in_row_stride, - const int in_col_stride, - TOut *__restrict__ const outptr, - const int out_row_stride, - const int out_col_stride, - const int _in_pad_top, - const int _in_pad_left, - const int _in_pad_bottom, - const int _in_pad_right, - const int _out_pad_bottom, - const int _out_pad_right, - const int _input_offset, - const int _weights_offset + const void* const packed_params, + const TIn* const inptr, + TOut* const outptr, + const int pad_in_top, + const int pad_in_left, + const int pad_in_bottom, + const int pad_in_right, + const int pad_out_bottom, + const int pad_out_right ) { - constexpr auto inner_tile_rows = DWC::inner_tile_rows; - constexpr auto inner_tile_cols = DWC::inner_tile_cols; - constexpr auto kernel_rows = DWC::kernel_rows; - constexpr auto kernel_cols = DWC::kernel_cols; - constexpr auto output_tile_rows = DWC::output_tile_rows; - constexpr auto output_tile_cols = DWC::output_tile_cols; - constexpr auto stride_rows = DWC::stride_rows; - constexpr auto stride_cols = DWC::stride_cols; - - // Extract parameters - const int in_pad_top = Specialize ? InPadTop : _in_pad_top; - const int in_pad_left = Specialize ? InPadLeft : _in_pad_left; - const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom; - const int in_pad_right = Specialize ? InPadRight : _in_pad_right; - const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom; - const int out_pad_right = Specialize ? OutPadRight : _out_pad_right; - - // Compute valid ranges of the tile - const int in_cells_i = inner_tile_rows - in_pad_bottom; - const int in_cells_j = inner_tile_cols - in_pad_right; - const int out_cells_i = output_tile_rows - out_pad_bottom; - const int out_cells_j = output_tile_cols - out_pad_right; - - // Instantiate pointers - const TIn* __restrict__ inptr_base = inptr; - const TIn* __restrict__ wptr_base = weights; - TOut* __restrict__ outptr_base = outptr; - - // Perform the depthwise convolution - int channels_remaining = n_channels; - for (; channels_remaining; channels_remaining--) + const bool pad_input = pad_in_top || pad_in_left || pad_in_bottom || pad_in_right; + const bool pad_output = pad_out_bottom || pad_out_right; + + if (pad_input) { - // Load input tile - TIn u[inner_tile_rows][inner_tile_cols]; - for (int i = 0; i < inner_tile_rows; i++) - { - const TIn* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride; - for (int j = 0; j < inner_tile_cols; j++) - { - if (i < in_pad_top || in_cells_i <= i || - j < in_pad_left || in_cells_j <= j) - { - u[i][j] = static_cast<TIn>(0); - } - else - { - u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride); - } - } - } - inptr_base++; + // Copy the input into the temporary buffer, applying padding + padding::copy_and_pad_tile<TIn>( + inner_tile_rows, inner_tile_cols, n_channels, + inptr, _input_row_stride, _input_col_stride, + static_cast<TIn *>(_get_input_working_space(threadid)), _input_ws_row_stride, _input_ws_col_stride, + pad_in_top, pad_in_left, pad_in_bottom, pad_in_right, + static_cast<Derived *>(this)->_input_padding_value() + ); + } - // Load weights tile - TIn w[kernel_rows][kernel_cols]; - for (int i = 0; i < kernel_rows; i++) - { - const TIn* const wptr_row = wptr_base + i*weight_row_stride; - for (int j = 0; j < kernel_cols; j++) - { - w[i][j] = *(wptr_row + j*weight_col_stride); - } - } - wptr_base++; + // Execute the kernel + const TIn * const tile_inptr = !pad_input ? inptr : static_cast<const TIn *>(_get_input_working_space(threadid)); + const int in_row_stride = !pad_input ? _input_row_stride : _input_ws_row_stride; + const int in_col_stride = !pad_input ? _input_col_stride : _input_ws_col_stride; - // Perform the convolution - TOut v[output_tile_rows][output_tile_cols]; - for (int out_i = 0; out_i < out_cells_i; out_i++) - { - for (int out_j = 0; out_j < out_cells_j; out_j++) - { - // Clear the accumulator - v[out_i][out_j] = static_cast<TOut>(0); - - // Base co-ordinate - const int base_i = out_i * stride_rows; - const int base_j = out_j * stride_cols; - - // Fill the accumulator - for (int in_i = 0; in_i < kernel_rows; in_i++) - { - const int i = base_i + in_i; - for (int in_j = 0; in_j < kernel_cols; in_j++) - { - const int j = base_j + in_j; - v[out_i][out_j] += w[in_i][in_j] * u[i][j]; - } - } - } - } + TOut * const tile_outptr = !pad_output ? outptr : static_cast<TOut *>(_get_output_working_space(threadid)); + const int out_row_stride = !pad_output ? _output_row_stride : _output_ws_row_stride; + const int out_col_stride = !pad_output ? _output_col_stride : _output_ws_col_stride; - // Store the output tile - for (int i = 0; i < out_cells_i; i++) - { - TOut* __restrict__ const outptr_row = outptr_base + i*out_row_stride; - for (int j = 0; j < out_cells_j; j++) - { - *(outptr_row + j*out_col_stride) = v[i][j]; - } - } - outptr_base++; + Derived * dthis = static_cast<Derived *>(this); + + switch(_activation) + { + case ActivationFunction::ReLU: + dthis->template execute_tile<ActivationFunction::ReLU>( + n_channels, packed_params, tile_inptr, in_row_stride, in_col_stride, tile_outptr, out_row_stride, out_col_stride + ); + break; + case ActivationFunction::ReLU6: + dthis->template execute_tile<ActivationFunction::ReLU6>( + n_channels, packed_params, tile_inptr, in_row_stride, in_col_stride, tile_outptr, out_row_stride, out_col_stride + ); + break; + default: + dthis->template execute_tile<ActivationFunction::None>( + n_channels, packed_params, tile_inptr, in_row_stride, in_col_stride, tile_outptr, out_row_stride, out_col_stride + ); + break; } + + if (pad_output) + { + // Copy the output from the temporary buffer, removing unnecessary values + padding::CopyCropped<OutputTileRows, OutputTileColumns>::execute( + n_channels * sizeof(TOut), + _get_output_working_space(threadid), _output_ws_row_stride * sizeof(TOut), _output_ws_col_stride * sizeof(TOut), + outptr, _output_row_stride * sizeof(TOut), _output_col_stride * sizeof(TOut), + 0, 0, pad_out_bottom, pad_out_right + ); + } +} + +MEMBERFN(int)::n_channels(void) const +{ + return _n_channels; } } // namespace depthwise |