aboutsummaryrefslogtreecommitdiff
path: root/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2019-03-11 14:03:23 +0000
committerGeorgios Pinitas <georgios.pinitas@arm.com>2019-03-29 09:54:53 +0000
commit47d39dc615d1dee2482bc84699802165a9778ac8 (patch)
tree87f2fdb4f4957be7ff1c043be6328e4154cdf9e1 /arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
parent2d2551ed3934f071eb6a65f5b776301454bc147a (diff)
downloadComputeLibrary-47d39dc615d1dee2482bc84699802165a9778ac8.tar.gz
COMPMID-1975: Update depthwise convolution.
Change-Id: Iad58672be35710a7ec2e918653d6d529709387e8 Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com> Reviewed-on: https://review.mlplatform.org/c/898 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Giuseppe Rossini <giuseppe.rossini@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Diffstat (limited to 'arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp')
-rw-r--r--arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp702
1 files changed, 364 insertions, 338 deletions
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
index 6d9cb18f44..45e8da0272 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
@@ -24,42 +24,84 @@
#pragma once
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/activation.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/padding.hpp"
+
namespace depthwise
{
+namespace nck = neon_convolution_kernels;
+
class IDepthwiseConvolution
{
public:
virtual ~IDepthwiseConvolution() = default;
- virtual int output_size(const int dim_size, const bool padding_same) const = 0;
+
virtual int output_size(
int dim_size,
unsigned int padding_before,
unsigned int padding_after
) const = 0;
+ /* Set input tensor and stride. */
+ virtual void set_input(const void *inptr) = 0;
+ virtual void set_input(const void *inptr, int column_stride) = 0;
+ virtual void set_input(const void *inptr, int row_stride, int column_stride) = 0;
+ virtual void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) = 0;
+
+ /* Set output tensor and stride. */
+ virtual void set_output(void *outptr) = 0;
+ virtual void set_output(void *outptr, int column_stride) = 0;
+ virtual void set_output(void *outptr, int row_stride, int column_stride) = 0;
+ virtual void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) = 0;
+
+ /* Weights and biases are re-ordered to improve memory access patterns. Use
+ * these methods to determine the size of the re-pack buffer and to set the
+ * address of (and implicitly reorder the weights and biases into) the buffer.
+ */
+ virtual size_t get_packed_params_size(void) const = 0;
+ virtual void set_packed_params_buffer(void *) = 0;
+
+ virtual void pack_params(const void *weights, const void *biases=nullptr) const = 0;
+ virtual void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const = 0;
+ virtual void pack_params(
+ void *buffer,
+ const void* weights,
+ unsigned int weight_row_stride,
+ unsigned int weight_col_stride,
+ const void *biases=nullptr
+ ) const = 0;
+
+ /* Working space is used to pad tensors on the fly. Before running any
+ * inference check the amount of space required, allocate and provide a
+ * pointer to the convolution engine.
+ */
+ virtual size_t get_working_space_size(unsigned int nthreads=1) const = 0;
+ virtual void set_working_space(void *) = 0;
+
virtual unsigned int get_window(void) const = 0;
- virtual void set_offsets(int input_offset, int weights_offset) = 0;
- virtual void run(const unsigned int start, const unsigned int stop) = 0;
+ virtual void run(
+ unsigned int start,
+ unsigned int stop,
+ unsigned int threadid=0
+ ) = 0;
};
template <
- int OutputTileRows,
- int OutputTileCols,
- int KernelRows,
- int KernelCols,
- int StrideRows,
- int StrideCols,
- typename TIn,
- typename TOut
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols,
+ typename TIn, typename TBias, typename TOut,
+ typename Derived
>
-class DepthwiseConvolution : public IDepthwiseConvolution
+class DepthwiseConvolutionBase : public IDepthwiseConvolution
{
public:
- typedef TIn InputType;
- typedef TOut OutputType;
-
// Information about the specific convolution instance
+ using InputType = TIn;
+ using BiasType = TBias;
+ using OutputType = TOut;
static constexpr int output_tile_rows = OutputTileRows;
static constexpr int output_tile_cols = OutputTileCols;
static constexpr int kernel_rows = KernelRows;
@@ -71,260 +113,84 @@ class DepthwiseConvolution : public IDepthwiseConvolution
/** Create a new depthwise convolution engine.
*
- * @param[in] n_batches Number of batches tensors.
- * @param[in] n_input_rows Number of rows in input tensor.
- * @param[in] n_input_cols Number of columns in input tensor.
- * @param[in] n_channels Number of channels in input and output tensors.
- * @param[in] padding_same True if padding is SAME, else VALID.
- * @param[in] weights Pointer to Height x Width x Channel ordered weights.
- * @param[in] input Pointer to NHWC ordered input tensor.
- * @param[out] output Pointer to NHWC ordered output tensor.
- */
- DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols,
- int n_channels, bool padding_same,
- const TIn* const weights,
- const TIn* const input,
- TOut* const output
- ) : DepthwiseConvolution(
- n_batches, n_input_rows, n_input_cols, n_channels, padding_same,
- weights, input, output, 0 /* column stride = default */
- )
- {
- }
-
- /** Create a new depthwise convolution engine.
- *
- * @param[in] n_batches Number of batches tensors.
- * @param[in] n_input_rows Number of rows in input tensor.
- * @param[in] n_input_cols Number of columns in input tensor.
- * @param[in] n_channels Number of channels in input and output tensors.
- * @param[in] padding_top Padding to apply to top of input.
- * @param[in] padding_left Padding to apply to left of input.
- * @param[in] padding_bottom Padding to apply to bottom of input.
- * @param[in] padding_right Padding to apply to right of input.
- * @param[in] weights Pointer to Height x Width x Channel ordered weights.
- * @param[in] input Pointer to NHWC ordered input tensor.
- * @param[out] output Pointer to NHWC ordered output tensor.
- */
- DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols,
- int n_channels,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right,
- const TIn* const weights,
- const TIn* const input,
- TOut* const output
- ) : DepthwiseConvolution(
- n_batches, n_input_rows, n_input_cols, n_channels,
- padding_top, padding_left, padding_bottom, padding_right,
- weights, input, output, 0 /* column stride = default */
- )
- {
- }
-
- /** Create a new depthwise convolution engine with a specified column stride.
- *
- * @param[in] n_batches Number of batches tensors.
- * @param[in] n_input_rows Number of rows in input tensor.
- * @param[in] n_input_cols Number of columns in input tensor.
- * @param[in] n_channels Number of channels in input and output tensors.
- * @param[in] padding_same True if padding is SAME, else VALID.
- * @param[in] weights Pointer to Height x Width x Channel ordered weights.
- * @param[in] input Pointer to NHWC ordered input tensor.
- * @param[out] output Pointer to NHWC ordered output tensor.
- * @param[in] col_stride Stride between columns of the weights, inputs and output tensors.
- */
- DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols,
- int n_channels, bool padding_same,
- const TIn* const weights,
- const TIn* const input,
- TOut* const output,
- const int col_stride
- ) : DepthwiseConvolution(
- n_batches, n_input_rows, n_input_cols, n_channels, padding_same,
- weights, input, output,
- col_stride, 0, /* Weight row stride = default */
- col_stride, 0, 0, /* Input row stride, batch stride = default */
- col_stride, 0, 0 /* Output row stride, batch stride = default */
- )
- {
- }
-
- /** Create a new depthwise convolution engine with a specified column stride.
- *
- * @param[in] n_batches Number of batches tensors.
- * @param[in] n_input_rows Number of rows in input tensor.
- * @param[in] n_input_cols Number of columns in input tensor.
- * @param[in] n_channels Number of channels in input and output tensors.
- * @param[in] padding_top Padding to apply to top of input.
- * @param[in] padding_left Padding to apply to left of input.
- * @param[in] padding_bottom Padding to apply to bottom of input.
- * @param[in] padding_right Padding to apply to right of input.
- * @param[in] weights Pointer to Height x Width x Channel ordered weights.
- * @param[in] input Pointer to NHWC ordered input tensor.
- * @param[out] output Pointer to NHWC ordered output tensor.
- * @param[in] col_stride Stride between columns of the weights, inputs and output tensors.
- */
- DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols,
- int n_channels,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right,
- const TIn* const weights,
- const TIn* const input,
- TOut* const output,
- const int col_stride
- ) : DepthwiseConvolution(
- n_batches, n_input_rows, n_input_cols, n_channels,
- padding_top, padding_left, padding_bottom, padding_right,
- weights, input, output,
- col_stride, 0, /* Weight row stride = default */
- col_stride, 0, 0, /* Input row stride, batch stride = default */
- col_stride, 0, 0 /* Output row stride, batch stride = default */
- )
- {
- }
-
- /** Create a new depthwise convolution engine.
- *
- * @param[in] n_batches Number of batches tensors.
- * @param[in] n_input_rows Number of rows in input tensor.
- * @param[in] n_input_cols Number of columns in input tensor.
- * @param[in] n_channels Number of channels in input and output tensors.
- * @param[in] padding_same True if padding is SAME, else VALID.
- * @param[in] weights Pointer to Height x Width x Channel ordered weights.
- * @param[in] input Pointer to NHWC ordered input tensor.
- * @param[out] output Pointer to NHWC ordered output tensor.
- * @param[in] weight_col_stride Stride between columns of the weights (if 0, defaults appropriately).
- * @param[in] weight_row_stride Stride between rows of the weights (if 0, defaults appropriately).
- * @param[in] input_col_stride Stride between columns of the input tensor (if 0, defaults appropriately).
- * @param[in] input_row_stride Stride between rows of the input tensor (if 0, defaults appropriately).
- * @param[in] input_batch_stride Stride between batches of the input tensor (if 0, defaults appropriately).
- * @param[in] output_col_stride Stride between columns of the output tensor (if 0, defaults appropriately).
- * @param[in] output_row_stride Stride between rows of the output tensor (if 0, defaults appropriately).
- * @param[in] output_batch_stride Stride between batches of the output tensor (if 0, defaults appropriately).
- */
- DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols,
- int n_channels, bool padding_same,
- const TIn* const weights,
- const TIn* const input,
- TOut* const output,
- int weight_col_stride,
- int weight_row_stride,
- int input_col_stride,
- int input_row_stride,
- int input_batch_stride,
- int output_col_stride,
- int output_row_stride,
- int output_batch_stride
- );
-
- /** Create a new depthwise convolution engine.
- *
- * @param[in] n_batches Number of batches tensors.
- * @param[in] n_input_rows Number of rows in input tensor.
- * @param[in] n_input_cols Number of columns in input tensor.
- * @param[in] n_channels Number of channels in input and output tensors.
- * @param[in] padding_top Padding to apply to top of input.
- * @param[in] padding_left Padding to apply to left of input.
- * @param[in] padding_bottom Padding to apply to bottom of input.
- * @param[in] padding_right Padding to apply to right of input.
- * @param[in] weights Pointer to Height x Width x Channel ordered weights.
- * @param[in] input Pointer to NHWC ordered input tensor.
- * @param[out] output Pointer to NHWC ordered output tensor.
- * @param[in] weight_col_stride Stride between columns of the weights (if 0, defaults appropriately).
- * @param[in] weight_row_stride Stride between rows of the weights (if 0, defaults appropriately).
- * @param[in] input_col_stride Stride between columns of the input tensor (if 0, defaults appropriately).
- * @param[in] input_row_stride Stride between rows of the input tensor (if 0, defaults appropriately).
- * @param[in] input_batch_stride Stride between batches of the input tensor (if 0, defaults appropriately).
- * @param[in] output_col_stride Stride between columns of the output tensor (if 0, defaults appropriately).
- * @param[in] output_row_stride Stride between rows of the output tensor (if 0, defaults appropriately).
- * @param[in] output_batch_stride Stride between batches of the output tensor (if 0, defaults appropriately).
+ * @param[in] n_batches Number of batches tensors.
+ * @param[in] n_input_rows Number of rows in input tensor.
+ * @param[in] n_input_cols Number of columns in input tensor.
+ * @param[in] n_channels Number of channels in input and output tensors.
*/
- DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols,
- int n_channels,
+ DepthwiseConvolutionBase(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ nck::ActivationFunction activation,
unsigned int padding_top,
unsigned int padding_left,
unsigned int padding_bottom,
- unsigned int padding_right,
- const TIn* const weights,
- const TIn* const input,
- TOut* const output,
- int weight_col_stride,
- int weight_row_stride,
- int input_col_stride,
- int input_row_stride,
- int input_batch_stride,
- int output_col_stride,
- int output_row_stride,
- int output_batch_stride
+ unsigned int padding_right
);
// Cannot copy or move a DepthwiseConvolution.
- DepthwiseConvolution(DepthwiseConvolution&) = delete;
- DepthwiseConvolution operator=(DepthwiseConvolution&) = delete;
+ DepthwiseConvolutionBase(DepthwiseConvolutionBase&) = delete;
+ DepthwiseConvolutionBase operator=(DepthwiseConvolutionBase&) = delete;
+
+ /* Set input tensor and stride. */
+ void set_input(const void *inptr) override;
+ void set_input(const void *inptr, int column_stride) override;
+ void set_input(const void *inptr, int row_stride, int column_stride) override;
+ void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override;
+
+ /* Set output tensor and stride. */
+ void set_output(void *outptr) override;
+ void set_output(void *outptr, int column_stride) override;
+ void set_output(void *outptr, int row_stride, int column_stride) override;
+ void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override;
/** Get the number of output rows/columns.
*
* @param[in] dim_size Number of elements in the dimension (rows/columns)
* @param[in] padding_before,padding_after Padding before and after the dimension.
*/
- static int get_output_size(int dim_size, bool padding_same);
static int get_output_size(
- int dim_size,
- unsigned int padding_before,
- unsigned int padding_after
+ int dim_size, unsigned int padding_before, unsigned int padding_after
);
- /** Get the number of output rows/columns.
- *
- * @param[in] dim_size Number of elements in the dimension (rows/columns)
- * @param[in] same_padding True if the padding is SAME, otherwise false.
+ int output_size(
+ int dim_size, unsigned int padding_before, unsigned int padding_after
+ ) const override;
+
+ /* Determine how much memory is required to store the packed weights and
+ * biases.
*/
- int output_size(int dim_size, bool padding_same) const override
- {
- return DepthwiseConvolution<
- OutputTileRows,
- OutputTileCols,
- KernelRows,
- KernelCols,
- StrideRows,
- StrideCols,
- TIn, TOut
- >::get_output_size(dim_size, padding_same);
- }
+ size_t get_packed_params_size(void) const override;
- int output_size(
- int dim_size,
- unsigned int padding_before,
- unsigned int padding_after
- ) const override
- {
- return DepthwiseConvolution<
- OutputTileRows,
- OutputTileCols,
- KernelRows,
- KernelCols,
- StrideRows,
- StrideCols,
- TIn, TOut
- >::get_output_size(dim_size, padding_before, padding_after);
- }
-
- /** Sets quantization offsets
- *
- * @param[in] input_offset Input offset
- * @param[in] weights_offset Weights offset
+ /* Set the buffer for the packed weights and biases, and perform the
+ * packing.
+ */
+ void set_packed_params_buffer(void *buffer) override;
+
+ void pack_params(const void *weights, const void *biases=nullptr) const override;
+
+ void pack_params(
+ void *buffer,
+ const void *weights,
+ const void *biases=nullptr
+ ) const override;
+
+ void pack_params(
+ void *buffer,
+ const void *weights,
+ unsigned int weight_row_stride,
+ unsigned int weight_col_stride,
+ const void *biases=nullptr
+ ) const override;
+
+ /** Query the amount of working space required.
+ * @param[in] n_threads The largest number of threads which will be used to execute
+ * the kernel.
+ */
+ size_t get_working_space_size(unsigned int n_threads=1) const override;
+
+ /** Set the working space buffer.
*/
- void set_offsets(int input_offset, int weights_offset) override;
+ void set_working_space(void *buffer) override;
/** Get the window of work to be performed by an instance of the operator.
*/
@@ -336,122 +202,282 @@ class DepthwiseConvolution : public IDepthwiseConvolution
*
* @param[in] start Start of the window of work to perform.
* @param[in] stop End of the work to perform.
+ * @param[in] threadid ID of the thread performing the work.
*/
- void run(unsigned int start, unsigned int stop) override;
+ void run(
+ unsigned int start,
+ unsigned int stop,
+ unsigned int threadid=0
+ ) override;
protected:
+ /** Get the value to use to pad the tensor.
+ */
+ TIn _input_padding_value(void) const;
+
+ /** Implementation of the parameter packing.
+ */
+ void _pack_params(
+ void *buffer,
+ const void *weights,
+ unsigned int weight_row_stride,
+ unsigned int weight_col_stride,
+ const void *biases=nullptr
+ ) const;
+
/** Process a tile-row of the tensors.
*/
- static void process_tile_row(
+ void process_tile_row(
+ unsigned int threadid,
int n_channels,
- const TIn* const weights,
- const int weight_row_stride,
- const int weight_col_stride,
- const TIn* const inptr,
- int in_row_stride,
- int in_col_stride,
- TOut* const outptr,
- int out_row_stride,
- int out_col_stride,
+ const void* packed_params,
+ const InputType* inptr,
+ OutputType* outptr,
int row_pad_in_top,
int row_pad_in_left,
int row_pad_in_bottom,
int row_pad_out_bottom,
int n_tiles,
int n_input_cols,
- int n_output_cols,
- int input_offset,
- int weights_offset
+ int n_output_cols
);
- // Determine the maximum (and minimum) padding values which can be applied
- // to tiles of the tensors involved in this class of convolution.
- static constexpr int max_in_pad_top = (kernel_rows - 1) / 2;
- static constexpr int min_in_pad_top = (kernel_rows - stride_rows) / 2;
-
- static constexpr int max_in_pad_left = (kernel_cols - 1) / 2;
- static constexpr int min_in_pad_left = (kernel_cols - stride_cols) / 2;
-
- static constexpr int max_in_pad_bottom = inner_tile_rows;
- static constexpr int max_in_pad_right = inner_tile_cols;
- static constexpr int max_out_pad_bottom = output_tile_rows;
- static constexpr int max_out_pad_right = output_tile_cols;
-
- static constexpr int n_in_pad_top_fns = (max_in_pad_top - min_in_pad_top) + 1;
- static constexpr int n_in_pad_left_fns = (max_in_pad_left - min_in_pad_left) + 1;
- static constexpr int n_in_pad_bottom_fns = max_in_pad_bottom + 1;
- static constexpr int n_in_pad_right_fns = max_in_pad_right + 1;
- static constexpr int n_out_pad_bottom_fns = max_out_pad_bottom + 1;
- static constexpr int n_out_pad_right_fns = max_out_pad_right + 1;
-
- /** Pointer to a function which will process a tile.
+ /** Process a single tile of the tensor.
*
- * @param[in] n_channels Number of channels.
- * @param[in] weights Pointer to Height x Width x Channels ordered weights.
- * @param[in] inptr Pointer to the top-left unpadded value of the tile.
- * @param[in] in_row_stride Stride between rows of the input tensor.
- * @param[in] in_col_stride Stride between columns of the input tensor.
- * @param[out] outptr Pointer to the top-left output value for the tile.
- * @param[in] out_row_stride Stride between rows of the output tensor.
- * @param[in] out_col_stride Stride between columns of the output tensor.
- *
- * The following parameters may be ignored if the function has been
- * specialised for specific padding constraints.
- *
- * @param[in] _in_pad_top Padding to apply to top of input tile.
- * @param[in] _in_pad_left Padding to apply to left of input tile.
- * @param[in] _in_pad_bottom Padding to apply to bottom of input tile.
- * @param[in] _in_pad_right Padding to apply to right of input tile.
- * @param[in] _out_pad_bottom Null cells at bottom of output tile.
- * @param[in] _out_pad_right Null cells at right of output tile.
+ * This method will apply input/output padding (if required) and call the
+ * depthwise tile implementation.
*/
- typedef void (*TileFn)(
+ void process_tile(
+ unsigned int threadid,
int n_channels,
- const TIn* const weights,
- int weight_row_stride,
- int weight_col_stride,
- const TIn* const inptr,
- int in_row_stride,
- int in_col_stride,
- TOut* const outptr,
- int out_row_stride,
- int out_col_stride,
- int _in_pad_top,
- int _in_pad_left,
- int _in_pad_bottom,
- int _in_pad_right,
- int _out_pad_bottom,
- int _out_pad_right,
- int _input_offset,
- int _weights_offset
+ const void* packed_params,
+ const InputType* inptr,
+ OutputType* outptr,
+ int pad_in_top,
+ int pad_in_left,
+ int pad_in_bottom,
+ int pad_in_right,
+ int pad_out_bottom,
+ int pad_out_right
);
- /* Arrays of methods to process tensor tiles.
- *
- * Allows dynamic dispatch to specialized implementations based on
- * different padding configurations.
+ /** Perform depthwise convolution on a single tile.
*/
- static const TileFn tilefn_unpadded;
- static const TileFn tilefn_top[n_in_pad_top_fns];
- static const TileFn tilefn_left[n_in_pad_left_fns];
- static const TileFn tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns];
- static const TileFn tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns];
- static const TileFn tilefn_generic;
+ template <nck::ActivationFunction Activation>
+ void execute_tile(
+ int n_channels,
+ const void* packed_params,
+ const InputType* inptr,
+ unsigned int in_row_stride,
+ unsigned int in_col_stride,
+ OutputType* outptr,
+ unsigned int out_row_stride,
+ unsigned int out_col_stride
+ );
+
+ int n_channels(void) const;
private:
// Member variables of instances of a convolution engine.
- const TIn* const _weights;
- const TIn* const _input;
- TOut* const _output;
+ const InputType* _input;
+ OutputType* _output;
+ void* _packed_parameters;
+ void* _working_space; // Per-thread working space
const int _n_batches, _n_input_rows, _n_input_cols, _n_channels,
_n_output_rows, _n_output_cols, _n_tile_rows, _n_tile_cols;
const unsigned int _padding_top, _padding_left, _padding_bottom, _padding_right;
+ const nck::ActivationFunction _activation;
// Stride information for a convolution instance
- const int _weight_col_stride, _weight_row_stride;
- const int _input_col_stride, _input_row_stride, _input_batch_stride;
- const int _output_col_stride, _output_row_stride, _output_batch_stride;
- int _input_offset, _weights_offset;
+ int _input_col_stride, _input_row_stride, _input_batch_stride;
+ const int _input_ws_col_stride, _input_ws_row_stride;
+ int _output_col_stride, _output_row_stride, _output_batch_stride;
+ const int _output_ws_col_stride, _output_ws_row_stride;
+
+ // Methods for getting access to working space
+ size_t _get_input_working_space_size(void) const;
+ size_t _get_output_working_space_size(void) const;
+
+ void *_get_input_working_space(unsigned int threadid) const;
+ void *_get_output_working_space(unsigned int threadid) const;
};
+
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols,
+ typename TIn, typename TBias, typename TOut
+>
+class DepthwiseConvolution : public DepthwiseConvolutionBase<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ TIn, TBias, TOut,
+ DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ TIn, TBias, TOut
+ >
+>
+{
+ using Base = DepthwiseConvolutionBase<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ TIn, TBias, TOut,
+ DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ TIn, TBias, TOut
+ > >;
+ friend Base;
+ using InputType = typename Base::InputType;
+ using OutputType = typename Base::OutputType;
+
+ public:
+ using Base::DepthwiseConvolutionBase;
+
+ protected:
+ template <nck::ActivationFunction Activation>
+ void execute_tile(
+ int n_channels,
+ const void* packed_params,
+ const TIn* inptr,
+ unsigned int in_row_stride,
+ unsigned int in_col_stride,
+ TOut* outptr,
+ unsigned int out_row_stride,
+ unsigned int out_col_stride
+ );
+};
+
+
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
+>
+class DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ float, float, float
+> : public DepthwiseConvolutionBase<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ float, float, float,
+ DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ float, float, float
+ >
+>
+{
+ using Base = DepthwiseConvolutionBase<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ float, float, float,
+ DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ float, float, float
+ > >;
+ friend Base;
+ using InputType = typename Base::InputType;
+ using OutputType = typename Base::OutputType;
+
+ public:
+ DepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ nck::ActivationFunction activation,
+ unsigned int padding_top,
+ unsigned int padding_left,
+ unsigned int padding_bottom,
+ unsigned int padding_right
+ );
+
+ protected:
+ template <nck::ActivationFunction Activation>
+ void execute_tile(
+ int n_channels,
+ const void* packed_params,
+ const float* inptr,
+ unsigned int in_row_stride,
+ unsigned int in_col_stride,
+ float* outptr,
+ unsigned int out_row_stride,
+ unsigned int out_col_stride
+ );
+};
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <
+ unsigned int OutputTileRows, unsigned int OutputTileCols,
+ unsigned int KernelRows, unsigned int KernelCols,
+ unsigned int StrideRows, unsigned int StrideCols
+>
+class DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ float16_t, float16_t, float16_t
+> : public DepthwiseConvolutionBase<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ float16_t, float16_t, float16_t,
+ DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ float16_t, float16_t, float16_t
+ >
+>
+{
+ using Base = DepthwiseConvolutionBase<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ float16_t, float16_t, float16_t,
+ DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ float16_t, float16_t, float16_t
+ > >;
+ friend Base;
+ using InputType = typename Base::InputType;
+ using OutputType = typename Base::OutputType;
+
+ public:
+ DepthwiseConvolution(
+ int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+ nck::ActivationFunction activation,
+ unsigned int padding_top,
+ unsigned int padding_left,
+ unsigned int padding_bottom,
+ unsigned int padding_right
+ );
+
+ protected:
+ template <nck::ActivationFunction Activation>
+ void execute_tile(
+ int n_channels,
+ const void* packed_params,
+ const float16_t* inptr,
+ unsigned int in_row_stride,
+ unsigned int in_col_stride,
+ float16_t* outptr,
+ unsigned int out_row_stride,
+ unsigned int out_col_stride
+ );
+};
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
} // namespace depthwise