diff options
Diffstat (limited to 'arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp')
-rw-r--r-- | arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp | 702 |
1 files changed, 364 insertions, 338 deletions
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp index 6d9cb18f44..45e8da0272 100644 --- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp +++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp @@ -24,42 +24,84 @@ #pragma once +#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/common/activation.hpp" +#include "arm_compute/core/NEON/kernels/convolution/common/padding.hpp" + namespace depthwise { +namespace nck = neon_convolution_kernels; + class IDepthwiseConvolution { public: virtual ~IDepthwiseConvolution() = default; - virtual int output_size(const int dim_size, const bool padding_same) const = 0; + virtual int output_size( int dim_size, unsigned int padding_before, unsigned int padding_after ) const = 0; + /* Set input tensor and stride. */ + virtual void set_input(const void *inptr) = 0; + virtual void set_input(const void *inptr, int column_stride) = 0; + virtual void set_input(const void *inptr, int row_stride, int column_stride) = 0; + virtual void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) = 0; + + /* Set output tensor and stride. */ + virtual void set_output(void *outptr) = 0; + virtual void set_output(void *outptr, int column_stride) = 0; + virtual void set_output(void *outptr, int row_stride, int column_stride) = 0; + virtual void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) = 0; + + /* Weights and biases are re-ordered to improve memory access patterns. Use + * these methods to determine the size of the re-pack buffer and to set the + * address (and implicitly reorder the weights and biases into) the buffer. + */ + virtual size_t get_packed_params_size(void) const = 0; + virtual void set_packed_params_buffer(void *) = 0; + + virtual void pack_params(const void *weights, const void *biases=nullptr) const = 0; + virtual void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const = 0; + virtual void pack_params( + void *buffer, + const void* weights, + unsigned int weight_row_stride, + unsigned int weight_col_stride, + const void *biases=nullptr + ) const = 0; + + /* Working space is used to pad tensors on the fly. Before running any + * inference check the amount of space required, allocate and provide a + * pointer to the convolution engine. + */ + virtual size_t get_working_space_size(unsigned int nthreads=1) const = 0; + virtual void set_working_space(void *) = 0; + virtual unsigned int get_window(void) const = 0; - virtual void set_offsets(int input_offset, int weights_offset) = 0; - virtual void run(const unsigned int start, const unsigned int stop) = 0; + virtual void run( + unsigned int start, + unsigned int stop, + unsigned int threadid=0 + ) = 0; }; template < - int OutputTileRows, - int OutputTileCols, - int KernelRows, - int KernelCols, - int StrideRows, - int StrideCols, - typename TIn, - typename TOut + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols, + typename TIn, typename TBias, typename TOut, + typename Derived > -class DepthwiseConvolution : public IDepthwiseConvolution +class DepthwiseConvolutionBase : public IDepthwiseConvolution { public: - typedef TIn InputType; - typedef TOut OutputType; - // Information about the specific convolution instance + using InputType = TIn; + using BiasType = TBias; + using OutputType = TOut; static constexpr int output_tile_rows = OutputTileRows; static constexpr int output_tile_cols = OutputTileCols; static constexpr int kernel_rows = KernelRows; @@ -71,260 +113,84 @@ class DepthwiseConvolution : public IDepthwiseConvolution /** Create a new depthwise convolution engine. * - * @param[in] n_batches Number of batches tensors. - * @param[in] n_input_rows Number of rows in input tensor. - * @param[in] n_input_cols Number of columns in input tensor. - * @param[in] n_channels Number of channels in input and output tensors. - * @param[in] padding_same True if padding is SAME, else VALID. - * @param[in] weights Pointer to Height x Width x Channel ordered weights. - * @param[in] input Pointer to NHWC ordered input tensor. - * @param[out] output Pointer to NHWC ordered output tensor. - */ - DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, - int n_channels, bool padding_same, - const TIn* const weights, - const TIn* const input, - TOut* const output - ) : DepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, padding_same, - weights, input, output, 0 /* column stride = default */ - ) - { - } - - /** Create a new depthwise convolution engine. - * - * @param[in] n_batches Number of batches tensors. - * @param[in] n_input_rows Number of rows in input tensor. - * @param[in] n_input_cols Number of columns in input tensor. - * @param[in] n_channels Number of channels in input and output tensors. - * @param[in] padding_top Padding to apply to top of input. - * @param[in] padding_left Padding to apply to left of input. - * @param[in] padding_bottom Padding to apply to bottom of input. - * @param[in] padding_right Padding to apply to right of input. - * @param[in] weights Pointer to Height x Width x Channel ordered weights. - * @param[in] input Pointer to NHWC ordered input tensor. - * @param[out] output Pointer to NHWC ordered output tensor. - */ - DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, - int n_channels, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right, - const TIn* const weights, - const TIn* const input, - TOut* const output - ) : DepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, - padding_top, padding_left, padding_bottom, padding_right, - weights, input, output, 0 /* column stride = default */ - ) - { - } - - /** Create a new depthwise convolution engine with a specified column stride. - * - * @param[in] n_batches Number of batches tensors. - * @param[in] n_input_rows Number of rows in input tensor. - * @param[in] n_input_cols Number of columns in input tensor. - * @param[in] n_channels Number of channels in input and output tensors. - * @param[in] padding_same True if padding is SAME, else VALID. - * @param[in] weights Pointer to Height x Width x Channel ordered weights. - * @param[in] input Pointer to NHWC ordered input tensor. - * @param[out] output Pointer to NHWC ordered output tensor. - * @param[in] col_stride Stride between columns of the weights, inputs and output tensors. - */ - DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, - int n_channels, bool padding_same, - const TIn* const weights, - const TIn* const input, - TOut* const output, - const int col_stride - ) : DepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, padding_same, - weights, input, output, - col_stride, 0, /* Weight row stride = default */ - col_stride, 0, 0, /* Input row stride, batch stride = default */ - col_stride, 0, 0 /* Output row stride, batch stride = default */ - ) - { - } - - /** Create a new depthwise convolution engine with a specified column stride. - * - * @param[in] n_batches Number of batches tensors. - * @param[in] n_input_rows Number of rows in input tensor. - * @param[in] n_input_cols Number of columns in input tensor. - * @param[in] n_channels Number of channels in input and output tensors. - * @param[in] padding_top Padding to apply to top of input. - * @param[in] padding_left Padding to apply to left of input. - * @param[in] padding_bottom Padding to apply to bottom of input. - * @param[in] padding_right Padding to apply to right of input. - * @param[in] weights Pointer to Height x Width x Channel ordered weights. - * @param[in] input Pointer to NHWC ordered input tensor. - * @param[out] output Pointer to NHWC ordered output tensor. - * @param[in] col_stride Stride between columns of the weights, inputs and output tensors. - */ - DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, - int n_channels, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right, - const TIn* const weights, - const TIn* const input, - TOut* const output, - const int col_stride - ) : DepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, - padding_top, padding_left, padding_bottom, padding_right, - weights, input, output, - col_stride, 0, /* Weight row stride = default */ - col_stride, 0, 0, /* Input row stride, batch stride = default */ - col_stride, 0, 0 /* Output row stride, batch stride = default */ - ) - { - } - - /** Create a new depthwise convolution engine. - * - * @param[in] n_batches Number of batches tensors. - * @param[in] n_input_rows Number of rows in input tensor. - * @param[in] n_input_cols Number of columns in input tensor. - * @param[in] n_channels Number of channels in input and output tensors. - * @param[in] padding_same True if padding is SAME, else VALID. - * @param[in] weights Pointer to Height x Width x Channel ordered weights. - * @param[in] input Pointer to NHWC ordered input tensor. - * @param[out] output Pointer to NHWC ordered output tensor. - * @param[in] weight_col_stride Stride between columns of the weights (if 0, defaults appropriately). - * @param[in] weight_row_stride Stride between rows of the weights (if 0, defaults appropriately). - * @param[in] input_col_stride Stride between columns of the input tensor (if 0, defaults appropriately). - * @param[in] input_row_stride Stride between rows of the input tensor (if 0, defaults appropriately). - * @param[in] input_batch_stride Stride between batches of the input tensor (if 0, defaults appropriately). - * @param[in] output_col_stride Stride between columns of the output tensor (if 0, defaults appropriately). - * @param[in] output_row_stride Stride between rows of the output tensor (if 0, defaults appropriately). - * @param[in] output_batch_stride Stride between batches of the output tensor (if 0, defaults appropriately). - */ - DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, - int n_channels, bool padding_same, - const TIn* const weights, - const TIn* const input, - TOut* const output, - int weight_col_stride, - int weight_row_stride, - int input_col_stride, - int input_row_stride, - int input_batch_stride, - int output_col_stride, - int output_row_stride, - int output_batch_stride - ); - - /** Create a new depthwise convolution engine. - * - * @param[in] n_batches Number of batches tensors. - * @param[in] n_input_rows Number of rows in input tensor. - * @param[in] n_input_cols Number of columns in input tensor. - * @param[in] n_channels Number of channels in input and output tensors. - * @param[in] padding_top Padding to apply to top of input. - * @param[in] padding_left Padding to apply to left of input. - * @param[in] padding_bottom Padding to apply to bottom of input. - * @param[in] padding_right Padding to apply to right of input. - * @param[in] weights Pointer to Height x Width x Channel ordered weights. - * @param[in] input Pointer to NHWC ordered input tensor. - * @param[out] output Pointer to NHWC ordered output tensor. - * @param[in] weight_col_stride Stride between columns of the weights (if 0, defaults appropriately). - * @param[in] weight_row_stride Stride between rows of the weights (if 0, defaults appropriately). - * @param[in] input_col_stride Stride between columns of the input tensor (if 0, defaults appropriately). - * @param[in] input_row_stride Stride between rows of the input tensor (if 0, defaults appropriately). - * @param[in] input_batch_stride Stride between batches of the input tensor (if 0, defaults appropriately). - * @param[in] output_col_stride Stride between columns of the output tensor (if 0, defaults appropriately). - * @param[in] output_row_stride Stride between rows of the output tensor (if 0, defaults appropriately). - * @param[in] output_batch_stride Stride between batches of the output tensor (if 0, defaults appropriately). + * @param[in] n_batches Number of batches tensors. + * @param[in] n_input_rows Number of rows in input tensor. + * @param[in] n_input_cols Number of columns in input tensor. + * @param[in] n_channels Number of channels in input and output tensors. */ - DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, - int n_channels, + DepthwiseConvolutionBase( + int n_batches, int n_input_rows, int n_input_cols, int n_channels, + nck::ActivationFunction activation, unsigned int padding_top, unsigned int padding_left, unsigned int padding_bottom, - unsigned int padding_right, - const TIn* const weights, - const TIn* const input, - TOut* const output, - int weight_col_stride, - int weight_row_stride, - int input_col_stride, - int input_row_stride, - int input_batch_stride, - int output_col_stride, - int output_row_stride, - int output_batch_stride + unsigned int padding_right ); // Cannot copy or move a DepthwiseConvolution. - DepthwiseConvolution(DepthwiseConvolution&) = delete; - DepthwiseConvolution operator=(DepthwiseConvolution&) = delete; + DepthwiseConvolutionBase(DepthwiseConvolutionBase&) = delete; + DepthwiseConvolutionBase operator=(DepthwiseConvolutionBase&) = delete; + + /* Set input tensor and stride. */ + void set_input(const void *inptr) override; + void set_input(const void *inptr, int column_stride) override; + void set_input(const void *inptr, int row_stride, int column_stride) override; + void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override; + + /* Set output tensor and stride. */ + void set_output(void *outptr) override; + void set_output(void *outptr, int column_stride) override; + void set_output(void *outptr, int row_stride, int column_stride) override; + void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override; /** Get the number of output rows/columns. * * @param[in] dim_size Number of elements in the dimension (rows/columns) * @param[in] same_padding True if the padding is SAME, otherwise false. */ - static int get_output_size(int dim_size, bool padding_same); static int get_output_size( - int dim_size, - unsigned int padding_before, - unsigned int padding_after + int dim_size, unsigned int padding_before, unsigned int padding_after ); - /** Get the number of output rows/columns. - * - * @param[in] dim_size Number of elements in the dimension (rows/columns) - * @param[in] same_padding True if the padding is SAME, otherwise false. + int output_size( + int dim_size, unsigned int padding_before, unsigned int padding_after + ) const override; + + /* Determine how much memory is required to store the packed weights and + * biases. */ - int output_size(int dim_size, bool padding_same) const override - { - return DepthwiseConvolution< - OutputTileRows, - OutputTileCols, - KernelRows, - KernelCols, - StrideRows, - StrideCols, - TIn, TOut - >::get_output_size(dim_size, padding_same); - } + size_t get_packed_params_size(void) const override; - int output_size( - int dim_size, - unsigned int padding_before, - unsigned int padding_after - ) const override - { - return DepthwiseConvolution< - OutputTileRows, - OutputTileCols, - KernelRows, - KernelCols, - StrideRows, - StrideCols, - TIn, TOut - >::get_output_size(dim_size, padding_before, padding_after); - } - - /** Sets quantization offsets - * - * @param[in] input_offset Input offset - * @param[in] weights_offset Weights offset + /* Set the buffer for the packed weights and biases, and perform the + * packing. + */ + void set_packed_params_buffer(void *buffer) override; + + void pack_params(const void *weights, const void *biases=nullptr) const override; + + void pack_params( + void *buffer, + const void *weights, + const void *biases=nullptr + ) const override; + + void pack_params( + void *buffer, + const void *weights, + unsigned int weight_row_stride, + unsigned int weight_col_stride, + const void *biases=nullptr + ) const override; + + /** Query the amount of working space required. + * @param[in] The largest number of threads which will be used to execute + * the kernel. + */ + size_t get_working_space_size(unsigned int n_threads=1) const override; + + /** Set the working space buffer. */ - void set_offsets(int input_offset, int weights_offset) override; + void set_working_space(void *buffer) override; /** Get the window of work to be performed by an instance of the operator. */ @@ -336,122 +202,282 @@ class DepthwiseConvolution : public IDepthwiseConvolution * * @param[in] start Start of the window of work to perform. * @param[in] stop End of the work to perform. + * @param[in] ID of the thread performing the work. */ - void run(unsigned int start, unsigned int stop) override; + void run( + unsigned int start, + unsigned int stop, + unsigned int threadid=0 + ) override; protected: + /** Get the value to use to pad the tensor. + */ + TIn _input_padding_value(void) const; + + /** Implementation of the parameter packing. + */ + void _pack_params( + void *buffer, + const void *weights, + unsigned int weight_row_stride, + unsigned int weight_col_stride, + const void *biases=nullptr + ) const; + /** Process a tile-row of the tensors. */ - static void process_tile_row( + void process_tile_row( + unsigned int threadid, int n_channels, - const TIn* const weights, - const int weight_row_stride, - const int weight_col_stride, - const TIn* const inptr, - int in_row_stride, - int in_col_stride, - TOut* const outptr, - int out_row_stride, - int out_col_stride, + const void* packed_params, + const InputType* inptr, + OutputType* outptr, int row_pad_in_top, int row_pad_in_left, int row_pad_in_bottom, int row_pad_out_bottom, int n_tiles, int n_input_cols, - int n_output_cols, - int input_offset, - int weights_offset + int n_output_cols ); - // Determine the maximum (and minimum) padding values which can be applied - // to tiles of the tensors involved in this class of convolution. - static constexpr int max_in_pad_top = (kernel_rows - 1) / 2; - static constexpr int min_in_pad_top = (kernel_rows - stride_rows) / 2; - - static constexpr int max_in_pad_left = (kernel_cols - 1) / 2; - static constexpr int min_in_pad_left = (kernel_cols - stride_cols) / 2; - - static constexpr int max_in_pad_bottom = inner_tile_rows; - static constexpr int max_in_pad_right = inner_tile_cols; - static constexpr int max_out_pad_bottom = output_tile_rows; - static constexpr int max_out_pad_right = output_tile_cols; - - static constexpr int n_in_pad_top_fns = (max_in_pad_top - min_in_pad_top) + 1; - static constexpr int n_in_pad_left_fns = (max_in_pad_left - min_in_pad_left) + 1; - static constexpr int n_in_pad_bottom_fns = max_in_pad_bottom + 1; - static constexpr int n_in_pad_right_fns = max_in_pad_right + 1; - static constexpr int n_out_pad_bottom_fns = max_out_pad_bottom + 1; - static constexpr int n_out_pad_right_fns = max_out_pad_right + 1; - - /** Pointer to a function which will process a tile. + /** Process a single tile of the tensor. * - * @param[in] n_channels Number of channels. - * @param[in] weights Pointer to Height x Width x Channels ordered weights. - * @param[in] inptr Pointer to the top-left unpadded value of the tile. - * @param[in] in_row_stride Stride between rows of the input tensor. - * @param[in] in_col_stride Stride between columns of the input tensor. - * @param[out] outptr Pointer to the top-left output value for the tile. - * @param[in] out_row_stride Stride between rows of the output tensor. - * @param[in] out_col_stride Stride between columns of the output tensor. - * - * The following parameters may be ignored if the function has been - * specialised for specific padding constraints. - * - * @param[in] _in_pad_top Padding to apply to top of input tile. - * @param[in] _in_pad_left Padding to apply to left of input tile. - * @param[in] _in_pad_bottom Padding to apply to bottom of input tile. - * @param[in] _in_pad_right Padding to apply to right of input tile. - * @param[in] _out_pad_bottom Null cells at bottom of output tile. - * @param[in] _out_pad_right Null cells at right of output tile. + * This method will apply input/output padding (if required) and call the + * depthwise tile implementation. */ - typedef void (*TileFn)( + void process_tile( + unsigned int threadid, int n_channels, - const TIn* const weights, - int weight_row_stride, - int weight_col_stride, - const TIn* const inptr, - int in_row_stride, - int in_col_stride, - TOut* const outptr, - int out_row_stride, - int out_col_stride, - int _in_pad_top, - int _in_pad_left, - int _in_pad_bottom, - int _in_pad_right, - int _out_pad_bottom, - int _out_pad_right, - int _input_offset, - int _weights_offset + const void* packed_params, + const InputType* inptr, + OutputType* outptr, + int pad_in_top, + int pad_in_left, + int pad_in_bottom, + int pad_in_right, + int pad_out_bottom, + int pad_out_right ); - /* Arrays of methods to process tensor tiles. - * - * Allows dynamic dispatch to specialized implementations based on - * different padding configurations. + /** Perform depthwise convolution on a single tile. */ - static const TileFn tilefn_unpadded; - static const TileFn tilefn_top[n_in_pad_top_fns]; - static const TileFn tilefn_left[n_in_pad_left_fns]; - static const TileFn tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns]; - static const TileFn tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns]; - static const TileFn tilefn_generic; + template <nck::ActivationFunction Activation> + void execute_tile( + int n_channels, + const void* packed_params, + const InputType* inptr, + unsigned int in_row_stride, + unsigned int in_col_stride, + OutputType* outptr, + unsigned int out_row_stride, + unsigned int out_col_stride + ); + + int n_channels(void) const; private: // Member variables of instances of a convolution engine. - const TIn* const _weights; - const TIn* const _input; - TOut* const _output; + const InputType* _input; + OutputType* _output; + void* _packed_parameters; + void* _working_space; // Per-thread working space const int _n_batches, _n_input_rows, _n_input_cols, _n_channels, _n_output_rows, _n_output_cols, _n_tile_rows, _n_tile_cols; const unsigned int _padding_top, _padding_left, _padding_bottom, _padding_right; + const nck::ActivationFunction _activation; // Stride information for a convolution instance - const int _weight_col_stride, _weight_row_stride; - const int _input_col_stride, _input_row_stride, _input_batch_stride; - const int _output_col_stride, _output_row_stride, _output_batch_stride; - int _input_offset, _weights_offset; + int _input_col_stride, _input_row_stride, _input_batch_stride; + const int _input_ws_col_stride, _input_ws_row_stride; + int _output_col_stride, _output_row_stride, _output_batch_stride; + const int _output_ws_col_stride, _output_ws_row_stride; + + // Methods for getting access to working space + size_t _get_input_working_space_size(void) const; + size_t _get_output_working_space_size(void) const; + + void *_get_input_working_space(unsigned int threadid) const; + void *_get_output_working_space(unsigned int threadid) const; }; + +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols, + typename TIn, typename TBias, typename TOut +> +class DepthwiseConvolution : public DepthwiseConvolutionBase< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + TIn, TBias, TOut, + DepthwiseConvolution< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + TIn, TBias, TOut + > +> +{ + using Base = DepthwiseConvolutionBase< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + TIn, TBias, TOut, + DepthwiseConvolution< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + TIn, TBias, TOut + > >; + friend Base; + using InputType = typename Base::InputType; + using OutputType = typename Base::OutputType; + + public: + using Base::DepthwiseConvolutionBase; + + protected: + template <nck::ActivationFunction Activation> + void execute_tile( + int n_channels, + const void* packed_params, + const TIn* inptr, + unsigned int in_row_stride, + unsigned int in_col_stride, + TOut* outptr, + unsigned int out_row_stride, + unsigned int out_col_stride + ); +}; + + +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +class DepthwiseConvolution< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + float, float, float +> : public DepthwiseConvolutionBase< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + float, float, float, + DepthwiseConvolution< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + float, float, float + > +> +{ + using Base = DepthwiseConvolutionBase< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + float, float, float, + DepthwiseConvolution< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + float, float, float + > >; + friend Base; + using InputType = typename Base::InputType; + using OutputType = typename Base::OutputType; + + public: + DepthwiseConvolution( + int n_batches, int n_input_rows, int n_input_cols, int n_channels, + nck::ActivationFunction activation, + unsigned int padding_top, + unsigned int padding_left, + unsigned int padding_bottom, + unsigned int padding_right + ); + + protected: + template <nck::ActivationFunction Activation> + void execute_tile( + int n_channels, + const void* packed_params, + const float* inptr, + unsigned int in_row_stride, + unsigned int in_col_stride, + float* outptr, + unsigned int out_row_stride, + unsigned int out_col_stride + ); +}; + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +class DepthwiseConvolution< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + float16_t, float16_t, float16_t +> : public DepthwiseConvolutionBase< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + float16_t, float16_t, float16_t, + DepthwiseConvolution< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + float16_t, float16_t, float16_t + > +> +{ + using Base = DepthwiseConvolutionBase< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + float16_t, float16_t, float16_t, + DepthwiseConvolution< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, + StrideRows, StrideCols, + float16_t, float16_t, float16_t + > >; + friend Base; + using InputType = typename Base::InputType; + using OutputType = typename Base::OutputType; + + public: + DepthwiseConvolution( + int n_batches, int n_input_rows, int n_input_cols, int n_channels, + nck::ActivationFunction activation, + unsigned int padding_top, + unsigned int padding_left, + unsigned int padding_bottom, + unsigned int padding_right + ); + + protected: + template <nck::ActivationFunction Activation> + void execute_tile( + int n_channels, + const void* packed_params, + const float16_t* inptr, + unsigned int in_row_stride, + unsigned int in_col_stride, + float16_t* outptr, + unsigned int out_row_stride, + unsigned int out_col_stride + ); +}; +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC } // namespace depthwise |