COMPMID-1975: Update depthwise convolution.

Change-Id: Iad58672be35710a7ec2e918653d6d529709387e8
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-on: https://review.mlplatform.org/c/898
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Giuseppe Rossini <giuseppe.rossini@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
author: Georgios Pinitas <georgios.pinitas@arm.com> 2019-03-11 14:03:23 +0000
committer: Georgios Pinitas <georgios.pinitas@arm.com> 2019-03-29 09:54:53 +0000
commit: 47d39dc615d1dee2482bc84699802165a9778ac8 (patch)
tree: 87f2fdb4f4957be7ff1c043be6328e4154cdf9e1 /arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
parent: 2d2551ed3934f071eb6a65f5b776301454bc147a (diff)
download: ComputeLibrary-47d39dc615d1dee2482bc84699802165a9778ac8.tar.gz
1 files changed, 364 insertions, 338 deletions
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
index 6d9cb18f44..45e8da0272 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
@@ -24,42 +24,84 @@
 
 #pragma once
 
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/activation.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/padding.hpp"
+
 namespace depthwise
 {
 
+namespace nck = neon_convolution_kernels;
+
 class IDepthwiseConvolution
 {
   public:
     virtual ~IDepthwiseConvolution() = default;
-    virtual int output_size(const int dim_size, const bool padding_same) const = 0;
+
     virtual int output_size(
       int dim_size,
       unsigned int padding_before,
       unsigned int padding_after
     ) const = 0;
 
+    /* Set input tensor and stride. */
+    virtual void set_input(const void *inptr) = 0;
+    virtual void set_input(const void *inptr, int column_stride) = 0;
+    virtual void set_input(const void *inptr, int row_stride, int column_stride) = 0;
+    virtual void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) = 0;
+
+    /* Set output tensor and stride. */
+    virtual void set_output(void *outptr) = 0;
+    virtual void set_output(void *outptr, int column_stride) = 0;
+    virtual void set_output(void *outptr, int row_stride, int column_stride) = 0;
+    virtual void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) = 0;
+
+    /* Weights and biases are re-ordered to improve memory access patterns. Use
+     * these methods to determine the size of the re-pack buffer and to set the
+     * address of (and implicitly reorder the weights and biases into) the buffer.
+     */
+    virtual size_t get_packed_params_size(void) const = 0;
+    virtual void set_packed_params_buffer(void *) = 0;
+
+    virtual void pack_params(const void *weights, const void *biases=nullptr) const = 0;
+    virtual void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const = 0;
+    virtual void pack_params(
+      void *buffer,
+      const void* weights,
+      unsigned int weight_row_stride,
+      unsigned int weight_col_stride,
+      const void *biases=nullptr
+    ) const = 0;
+
+    /* Working space is used to pad tensors on the fly. Before running any
+     * inference check the amount of space required, allocate and provide a
+     * pointer to the convolution engine.
+     */
+    virtual size_t get_working_space_size(unsigned int nthreads=1) const = 0;
+    virtual void set_working_space(void *) = 0;
+
     virtual unsigned int get_window(void) const = 0;
-    virtual void set_offsets(int input_offset, int weights_offset) = 0;
-    virtual void run(const unsigned int start, const unsigned int stop) = 0;
+    virtual void run(
+      unsigned int start,
+      unsigned int stop,
+      unsigned int threadid=0
+    ) = 0;
 };
 
 template <
-  int OutputTileRows,
-  int OutputTileCols,
-  int KernelRows,
-  int KernelCols,
-  int StrideRows,
-  int StrideCols,
-  typename TIn,
-  typename TOut
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols,
+  typename TIn, typename TBias, typename TOut,
+  typename Derived
 >
-class DepthwiseConvolution : public IDepthwiseConvolution
+class DepthwiseConvolutionBase : public IDepthwiseConvolution
 {
   public:
-    typedef TIn InputType;
-    typedef TOut OutputType;
-
     // Information about the specific convolution instance
+    using InputType = TIn;
+    using BiasType = TBias;
+    using OutputType = TOut;
     static constexpr int output_tile_rows = OutputTileRows;
     static constexpr int output_tile_cols = OutputTileCols;
     static constexpr int kernel_rows = KernelRows;
@@ -71,260 +113,84 @@ class DepthwiseConvolution : public IDepthwiseConvolution
 
     /** Create a new depthwise convolution engine.
      *
-     * @param[in]  n_batches Number of batches tensors.
-     * @param[in]  n_input_rows Number of rows in input tensor.
-     * @param[in]  n_input_cols Number of columns in input tensor.
-     * @param[in]  n_channels Number of channels in input and output tensors.
-     * @param[in]  padding_same True if padding is SAME, else VALID.
-     * @param[in]  weights Pointer to Height x Width x Channel ordered weights.
-     * @param[in]  input Pointer to NHWC ordered input tensor.
-     * @param[out] output Pointer to NHWC ordered output tensor.
-     */
-    DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols,
-      int n_channels, bool padding_same,
-      const TIn* const weights,
-      const TIn* const input,
-      TOut* const output
-    ) : DepthwiseConvolution(
-      n_batches, n_input_rows, n_input_cols, n_channels, padding_same,
-      weights, input, output, 0 /* column stride = default */
-    )
-    {
-    }
-
-    /** Create a new depthwise convolution engine.
-     *
-     * @param[in]  n_batches Number of batches tensors.
-     * @param[in]  n_input_rows Number of rows in input tensor.
-     * @param[in]  n_input_cols Number of columns in input tensor.
-     * @param[in]  n_channels Number of channels in input and output tensors.
-     * @param[in]  padding_top Padding to apply to top of input.
-     * @param[in]  padding_left Padding to apply to left of input.
-     * @param[in]  padding_bottom Padding to apply to bottom of input.
-     * @param[in]  padding_right Padding to apply to right of input.
-     * @param[in]  weights Pointer to Height x Width x Channel ordered weights.
-     * @param[in]  input Pointer to NHWC ordered input tensor.
-     * @param[out] output Pointer to NHWC ordered output tensor.
-     */
-    DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols,
-      int n_channels,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right,
-      const TIn* const weights,
-      const TIn* const input,
-      TOut* const output
-    ) : DepthwiseConvolution(
-      n_batches, n_input_rows, n_input_cols, n_channels,
-      padding_top, padding_left, padding_bottom, padding_right,
-      weights, input, output, 0 /* column stride = default */
-    )
-    {
-    }
-
-    /** Create a new depthwise convolution engine with a specified column stride.
-     *
-     * @param[in]  n_batches Number of batches tensors.
-     * @param[in]  n_input_rows Number of rows in input tensor.
-     * @param[in]  n_input_cols Number of columns in input tensor.
-     * @param[in]  n_channels Number of channels in input and output tensors.
-     * @param[in]  padding_same True if padding is SAME, else VALID.
-     * @param[in]  weights Pointer to Height x Width x Channel ordered weights.
-     * @param[in]  input Pointer to NHWC ordered input tensor.
-     * @param[out] output Pointer to NHWC ordered output tensor.
-     * @param[in]  col_stride Stride between columns of the weights, inputs and output tensors.
-     */
-    DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols,
-      int n_channels, bool padding_same,
-      const TIn* const weights,
-      const TIn* const input,
-      TOut* const output,
-      const int col_stride
-    ) : DepthwiseConvolution(
-      n_batches, n_input_rows, n_input_cols, n_channels, padding_same,
-      weights, input, output,
-      col_stride, 0,    /* Weight row stride = default */
-      col_stride, 0, 0, /* Input row stride, batch stride = default */
-      col_stride, 0, 0  /* Output row stride, batch stride = default */
-    )
-    {
-    }
-
-    /** Create a new depthwise convolution engine with a specified column stride.
-     *
-     * @param[in]  n_batches Number of batches tensors.
-     * @param[in]  n_input_rows Number of rows in input tensor.
-     * @param[in]  n_input_cols Number of columns in input tensor.
-     * @param[in]  n_channels Number of channels in input and output tensors.
-     * @param[in]  padding_top Padding to apply to top of input.
-     * @param[in]  padding_left Padding to apply to left of input.
-     * @param[in]  padding_bottom Padding to apply to bottom of input.
-     * @param[in]  padding_right Padding to apply to right of input.
-     * @param[in]  weights Pointer to Height x Width x Channel ordered weights.
-     * @param[in]  input Pointer to NHWC ordered input tensor.
-     * @param[out] output Pointer to NHWC ordered output tensor.
-     * @param[in]  col_stride Stride between columns of the weights, inputs and output tensors.
-     */
-    DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols,
-      int n_channels,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right,
-      const TIn* const weights,
-      const TIn* const input,
-      TOut* const output,
-      const int col_stride
-    ) : DepthwiseConvolution(
-      n_batches, n_input_rows, n_input_cols, n_channels,
-      padding_top, padding_left, padding_bottom, padding_right,
-      weights, input, output,
-      col_stride, 0,    /* Weight row stride = default */
-      col_stride, 0, 0, /* Input row stride, batch stride = default */
-      col_stride, 0, 0  /* Output row stride, batch stride = default */
-    )
-    {
-    }
-
-    /** Create a new depthwise convolution engine.
-     *
-     * @param[in]  n_batches Number of batches tensors.
-     * @param[in]  n_input_rows Number of rows in input tensor.
-     * @param[in]  n_input_cols Number of columns in input tensor.
-     * @param[in]  n_channels Number of channels in input and output tensors.
-     * @param[in]  padding_same True if padding is SAME, else VALID.
-     * @param[in]  weights Pointer to Height x Width x Channel ordered weights.
-     * @param[in]  input Pointer to NHWC ordered input tensor.
-     * @param[out] output Pointer to NHWC ordered output tensor.
-     * @param[in]  weight_col_stride Stride between columns of the weights (if 0, defaults appropriately).
-     * @param[in]  weight_row_stride Stride between rows of the weights (if 0, defaults appropriately).
-     * @param[in]  input_col_stride Stride between columns of the input tensor (if 0, defaults appropriately).
-     * @param[in]  input_row_stride Stride between rows of the input tensor (if 0, defaults appropriately).
-     * @param[in]  input_batch_stride Stride between batches of the input tensor (if 0, defaults appropriately).
-     * @param[in]  output_col_stride Stride between columns of the output tensor (if 0, defaults appropriately).
-     * @param[in]  output_row_stride Stride between rows of the output tensor (if 0, defaults appropriately).
-     * @param[in]  output_batch_stride Stride between batches of the output tensor (if 0, defaults appropriately).
-     */
-    DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols,
-      int n_channels, bool padding_same,
-      const TIn* const weights,
-      const TIn* const input,
-      TOut* const output,
-      int weight_col_stride,
-      int weight_row_stride,
-      int input_col_stride,
-      int input_row_stride,
-      int input_batch_stride,
-      int output_col_stride,
-      int output_row_stride,
-      int output_batch_stride
-    );
-
-    /** Create a new depthwise convolution engine.
-     *
-     * @param[in]  n_batches Number of batches tensors.
-     * @param[in]  n_input_rows Number of rows in input tensor.
-     * @param[in]  n_input_cols Number of columns in input tensor.
-     * @param[in]  n_channels Number of channels in input and output tensors.
-     * @param[in]  padding_top Padding to apply to top of input.
-     * @param[in]  padding_left Padding to apply to left of input.
-     * @param[in]  padding_bottom Padding to apply to bottom of input.
-     * @param[in]  padding_right Padding to apply to right of input.
-     * @param[in]  weights Pointer to Height x Width x Channel ordered weights.
-     * @param[in]  input Pointer to NHWC ordered input tensor.
-     * @param[out] output Pointer to NHWC ordered output tensor.
-     * @param[in]  weight_col_stride Stride between columns of the weights (if 0, defaults appropriately).
-     * @param[in]  weight_row_stride Stride between rows of the weights (if 0, defaults appropriately).
-     * @param[in]  input_col_stride Stride between columns of the input tensor (if 0, defaults appropriately).
-     * @param[in]  input_row_stride Stride between rows of the input tensor (if 0, defaults appropriately).
-     * @param[in]  input_batch_stride Stride between batches of the input tensor (if 0, defaults appropriately).
-     * @param[in]  output_col_stride Stride between columns of the output tensor (if 0, defaults appropriately).
-     * @param[in]  output_row_stride Stride between rows of the output tensor (if 0, defaults appropriately).
-     * @param[in]  output_batch_stride Stride between batches of the output tensor (if 0, defaults appropriately).
+     * @param[in] n_batches Number of batches in the tensors.
+     * @param[in] n_input_rows Number of rows in input tensor.
+     * @param[in] n_input_cols Number of columns in input tensor.
+     * @param[in] n_channels Number of channels in input and output tensors.
      */
-    DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols,
-      int n_channels,
+    DepthwiseConvolutionBase(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      nck::ActivationFunction activation,
       unsigned int padding_top,
       unsigned int padding_left,
       unsigned int padding_bottom,
-      unsigned int padding_right,
-      const TIn* const weights,
-      const TIn* const input,
-      TOut* const output,
-      int weight_col_stride,
-      int weight_row_stride,
-      int input_col_stride,
-      int input_row_stride,
-      int input_batch_stride,
-      int output_col_stride,
-      int output_row_stride,
-      int output_batch_stride
+      unsigned int padding_right
     );
 
     // Cannot copy or move a DepthwiseConvolution.
-    DepthwiseConvolution(DepthwiseConvolution&) = delete;
-    DepthwiseConvolution operator=(DepthwiseConvolution&) = delete;
+    DepthwiseConvolutionBase(DepthwiseConvolutionBase&) = delete;
+    DepthwiseConvolutionBase operator=(DepthwiseConvolutionBase&) = delete;
+
+    /* Set input tensor and stride. */
+    void set_input(const void *inptr) override;
+    void set_input(const void *inptr, int column_stride) override;
+    void set_input(const void *inptr, int row_stride, int column_stride) override;
+    void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override;
+
+    /* Set output tensor and stride. */
+    void set_output(void *outptr) override;
+    void set_output(void *outptr, int column_stride) override;
+    void set_output(void *outptr, int row_stride, int column_stride) override;
+    void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override;
 
     /** Get the number of output rows/columns.
      *
      * @param[in] dim_size Number of elements in the dimension (rows/columns)
      * @param[in] same_padding True if the padding is SAME, otherwise false.
      */
-    static int get_output_size(int dim_size, bool padding_same);
     static int get_output_size(
-      int dim_size,
-      unsigned int padding_before,
-      unsigned int padding_after
+      int dim_size, unsigned int padding_before, unsigned int padding_after
     );
 
-    /** Get the number of output rows/columns.
-     *
-     * @param[in] dim_size Number of elements in the dimension (rows/columns)
-     * @param[in] same_padding True if the padding is SAME, otherwise false.
+    int output_size(
+      int dim_size, unsigned int padding_before, unsigned int padding_after
+    ) const override;
+
+    /* Determine how much memory is required to store the packed weights and
+     * biases.
      */
-    int output_size(int dim_size, bool padding_same) const override
-    {
-      return DepthwiseConvolution<
-        OutputTileRows,
-        OutputTileCols,
-        KernelRows,
-        KernelCols,
-        StrideRows,
-        StrideCols,
-        TIn, TOut
-      >::get_output_size(dim_size, padding_same);
-    }
+    size_t get_packed_params_size(void) const override;
 
-    int output_size(
-        int dim_size,
-        unsigned int padding_before,
-        unsigned int padding_after
-    ) const override
-    {
-      return DepthwiseConvolution<
-        OutputTileRows,
-        OutputTileCols,
-        KernelRows,
-        KernelCols,
-        StrideRows,
-        StrideCols,
-        TIn, TOut
-      >::get_output_size(dim_size, padding_before, padding_after);
-    }
-
-    /** Sets quantization offsets
-     *
-     * @param[in] input_offset   Input offset
-     * @param[in] weights_offset Weights offset
+    /* Set the buffer for the packed weights and biases, and perform the
+     * packing.
+     */
+    void set_packed_params_buffer(void *buffer) override;
+
+    void pack_params(const void *weights, const void *biases=nullptr) const override;
+
+    void pack_params(
+      void *buffer,
+      const void *weights,
+      const void *biases=nullptr
+    ) const override;
+
+    void pack_params(
+      void *buffer,
+      const void *weights,
+      unsigned int weight_row_stride,
+      unsigned int weight_col_stride,
+      const void *biases=nullptr
+    ) const override;
+
+    /** Query the amount of working space required.
+     * @param[in] n_threads The largest number of threads which will be used
+     *            to execute the kernel.
+     */
+    size_t get_working_space_size(unsigned int n_threads=1) const override;
+
+    /** Set the working space buffer.
      */
-     void set_offsets(int input_offset, int weights_offset) override;
+    void set_working_space(void *buffer) override;
 
     /** Get the window of work to be performed by an instance of the operator.
      */
@@ -336,122 +202,282 @@ class DepthwiseConvolution : public IDepthwiseConvolution
      *
      * @param[in] start Start of the window of work to perform.
      * @param[in] stop End of the work to perform.
+     * @param[in] threadid ID of the thread performing the work.
      */
-    void run(unsigned int start, unsigned int stop) override;
+    void run(
+      unsigned int start,
+      unsigned int stop,
+      unsigned int threadid=0
+    ) override;
 
   protected:
+    /** Get the value to use to pad the tensor.
+     */
+    TIn _input_padding_value(void) const;
+
+    /** Implementation of the parameter packing.
+     */
+    void _pack_params(
+      void *buffer,
+      const void *weights,
+      unsigned int weight_row_stride,
+      unsigned int weight_col_stride,
+      const void *biases=nullptr
+    ) const;
+
     /** Process a tile-row of the tensors.
      */
-    static void process_tile_row(
+    void process_tile_row(
+      unsigned int threadid,
       int n_channels,
-      const TIn* const weights,
-      const int weight_row_stride,
-      const int weight_col_stride,
-      const TIn* const inptr,
-      int in_row_stride,
-      int in_col_stride,
-      TOut* const outptr,
-      int out_row_stride,
-      int out_col_stride,
+      const void* packed_params,
+      const InputType* inptr,
+      OutputType* outptr,
       int row_pad_in_top,
       int row_pad_in_left,
       int row_pad_in_bottom,
       int row_pad_out_bottom,
       int n_tiles,
       int n_input_cols,
-      int n_output_cols,
-      int input_offset,
-      int weights_offset
+      int n_output_cols
     );
 
-    // Determine the maximum (and minimum) padding values which can be applied
-    // to tiles of the tensors involved in this class of convolution.
-    static constexpr int max_in_pad_top = (kernel_rows - 1) / 2;
-    static constexpr int min_in_pad_top = (kernel_rows - stride_rows) / 2;
-
-    static constexpr int max_in_pad_left = (kernel_cols - 1) / 2;
-    static constexpr int min_in_pad_left = (kernel_cols - stride_cols) / 2;
-
-    static constexpr int max_in_pad_bottom = inner_tile_rows;
-    static constexpr int max_in_pad_right = inner_tile_cols;
-    static constexpr int max_out_pad_bottom = output_tile_rows;
-    static constexpr int max_out_pad_right = output_tile_cols;
-
-    static constexpr int n_in_pad_top_fns = (max_in_pad_top - min_in_pad_top) + 1;
-    static constexpr int n_in_pad_left_fns = (max_in_pad_left - min_in_pad_left) + 1;
-    static constexpr int n_in_pad_bottom_fns = max_in_pad_bottom + 1;
-    static constexpr int n_in_pad_right_fns = max_in_pad_right + 1;
-    static constexpr int n_out_pad_bottom_fns = max_out_pad_bottom + 1;
-    static constexpr int n_out_pad_right_fns = max_out_pad_right + 1;
-
-    /** Pointer to a function which will process a tile.
+    /** Process a single tile of the tensor.
      *
-     * @param[in] n_channels Number of channels.
-     * @param[in] weights Pointer to Height x Width x Channels ordered weights.
-     * @param[in] inptr Pointer to the top-left unpadded value of the tile.
-     * @param[in] in_row_stride Stride between rows of the input tensor.
-     * @param[in] in_col_stride Stride between columns of the input tensor.
-     * @param[out] outptr Pointer to the top-left output value for the tile.
-     * @param[in] out_row_stride Stride between rows of the output tensor.
-     * @param[in] out_col_stride Stride between columns of the output tensor.
-     *
-     * The following parameters may be ignored if the function has been
-     * specialised for specific padding constraints.
-     *
-     * @param[in] _in_pad_top Padding to apply to top of input tile.
-     * @param[in] _in_pad_left Padding to apply to left of input tile.
-     * @param[in] _in_pad_bottom Padding to apply to bottom of input tile.
-     * @param[in] _in_pad_right Padding to apply to right of input tile.
-     * @param[in] _out_pad_bottom Null cells at bottom of output tile.
-     * @param[in] _out_pad_right Null cells at right of output tile.
+     * This method will apply input/output padding (if required) and call the
+     * depthwise tile implementation.
      */
-    typedef void (*TileFn)(
+    void process_tile(
+      unsigned int threadid,
       int n_channels,
-      const TIn* const weights,
-      int weight_row_stride,
-      int weight_col_stride,
-      const TIn* const inptr,
-      int in_row_stride,
-      int in_col_stride,
-      TOut* const outptr,
-      int out_row_stride,
-      int out_col_stride,
-      int _in_pad_top,
-      int _in_pad_left,
-      int _in_pad_bottom,
-      int _in_pad_right,
-      int _out_pad_bottom,
-      int _out_pad_right,
-      int _input_offset,
-      int _weights_offset
+      const void* packed_params,
+      const InputType* inptr,
+      OutputType* outptr,
+      int pad_in_top,
+      int pad_in_left,
+      int pad_in_bottom,
+      int pad_in_right,
+      int pad_out_bottom,
+      int pad_out_right
     );
 
-    /* Arrays of methods to process tensor tiles.
-     *
-     * Allows dynamic dispatch to specialized implementations based on
-     * different padding configurations.
+    /** Perform depthwise convolution on a single tile.
      */
-    static const TileFn tilefn_unpadded;
-    static const TileFn tilefn_top[n_in_pad_top_fns];
-    static const TileFn tilefn_left[n_in_pad_left_fns];
-    static const TileFn tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns];
-    static const TileFn tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns];
-    static const TileFn tilefn_generic;
+    template <nck::ActivationFunction Activation>
+    void execute_tile(
+      int n_channels,
+      const void* packed_params,
+      const InputType* inptr,
+      unsigned int in_row_stride,
+      unsigned int in_col_stride,
+      OutputType* outptr,
+      unsigned int out_row_stride,
+      unsigned int out_col_stride
+    );
+
+    int n_channels(void) const;
 
   private:
     // Member variables of instances of a convolution engine.
-    const TIn* const _weights;
-    const TIn* const _input;
-    TOut* const _output;
+    const InputType* _input;
+    OutputType* _output;
+    void* _packed_parameters;
+    void* _working_space;  // Per-thread working space
     const int _n_batches, _n_input_rows, _n_input_cols, _n_channels,
               _n_output_rows, _n_output_cols, _n_tile_rows, _n_tile_cols;
     const unsigned int _padding_top, _padding_left, _padding_bottom, _padding_right;
+    const nck::ActivationFunction _activation;
 
     // Stride information for a convolution instance
-    const int _weight_col_stride, _weight_row_stride;
-    const int _input_col_stride, _input_row_stride, _input_batch_stride;
-    const int _output_col_stride, _output_row_stride, _output_batch_stride;
-    int _input_offset, _weights_offset;
+    int _input_col_stride, _input_row_stride, _input_batch_stride;
+    const int _input_ws_col_stride, _input_ws_row_stride;
+    int _output_col_stride, _output_row_stride, _output_batch_stride;
+    const int _output_ws_col_stride, _output_ws_row_stride;
+
+    // Methods for getting access to working space
+    size_t _get_input_working_space_size(void) const;
+    size_t _get_output_working_space_size(void) const;
+
+    void *_get_input_working_space(unsigned int threadid) const;
+    void *_get_output_working_space(unsigned int threadid) const;
 };
 
+
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols,
+  typename TIn, typename TBias, typename TOut
+>
+class DepthwiseConvolution : public DepthwiseConvolutionBase<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols,
+  StrideRows, StrideCols,
+  TIn, TBias, TOut,
+  DepthwiseConvolution<
+    OutputTileRows, OutputTileCols,
+    KernelRows, KernelCols,
+    StrideRows, StrideCols,
+    TIn, TBias, TOut
+  >
+>
+{
+  using Base = DepthwiseConvolutionBase<
+    OutputTileRows, OutputTileCols,
+    KernelRows, KernelCols,
+    StrideRows, StrideCols,
+    TIn, TBias, TOut,
+    DepthwiseConvolution<
+      OutputTileRows, OutputTileCols,
+      KernelRows, KernelCols,
+      StrideRows, StrideCols,
+      TIn, TBias, TOut
+  > >;
+  friend Base;
+  using InputType = typename Base::InputType;
+  using OutputType = typename Base::OutputType;
+
+  public:
+    using Base::DepthwiseConvolutionBase;
+
+  protected:
+    template <nck::ActivationFunction Activation>
+    void execute_tile(
+      int n_channels,
+      const void* packed_params,
+      const TIn* inptr,
+      unsigned int in_row_stride,
+      unsigned int in_col_stride,
+      TOut* outptr,
+      unsigned int out_row_stride,
+      unsigned int out_col_stride
+    );
+};
+
+
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
+>
+class DepthwiseConvolution<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols,
+  StrideRows, StrideCols,
+  float, float, float
+> : public DepthwiseConvolutionBase<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols,
+  StrideRows, StrideCols,
+  float, float, float,
+  DepthwiseConvolution<
+    OutputTileRows, OutputTileCols,
+    KernelRows, KernelCols,
+    StrideRows, StrideCols,
+    float, float, float
+  >
+>
+{
+  using Base = DepthwiseConvolutionBase<
+    OutputTileRows, OutputTileCols,
+    KernelRows, KernelCols,
+    StrideRows, StrideCols,
+    float, float, float,
+    DepthwiseConvolution<
+      OutputTileRows, OutputTileCols,
+      KernelRows, KernelCols,
+      StrideRows, StrideCols,
+      float, float, float
+  > >;
+  friend Base;
+  using InputType = typename Base::InputType;
+  using OutputType = typename Base::OutputType;
+
+  public:
+    DepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      nck::ActivationFunction activation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+  protected:
+    template <nck::ActivationFunction Activation>
+    void execute_tile(
+      int n_channels,
+      const void* packed_params,
+      const float* inptr,
+      unsigned int in_row_stride,
+      unsigned int in_col_stride,
+      float* outptr,
+      unsigned int out_row_stride,
+      unsigned int out_col_stride
+    );
+};
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
+>
+class DepthwiseConvolution<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols,
+  StrideRows, StrideCols,
+  float16_t, float16_t, float16_t
+> : public DepthwiseConvolutionBase<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols,
+  StrideRows, StrideCols,
+  float16_t, float16_t, float16_t,
+  DepthwiseConvolution<
+    OutputTileRows, OutputTileCols,
+    KernelRows, KernelCols,
+    StrideRows, StrideCols,
+    float16_t, float16_t, float16_t
+  >
+>
+{
+  using Base = DepthwiseConvolutionBase<
+    OutputTileRows, OutputTileCols,
+    KernelRows, KernelCols,
+    StrideRows, StrideCols,
+    float16_t, float16_t, float16_t,
+    DepthwiseConvolution<
+      OutputTileRows, OutputTileCols,
+      KernelRows, KernelCols,
+      StrideRows, StrideCols,
+      float16_t, float16_t, float16_t
+  > >;
+  friend Base;
+  using InputType = typename Base::InputType;
+  using OutputType = typename Base::OutputType;
+
+  public:
+    DepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      nck::ActivationFunction activation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+  protected:
+    template <nck::ActivationFunction Activation>
+    void execute_tile(
+      int n_channels,
+      const void* packed_params,
+      const float16_t* inptr,
+      unsigned int in_row_stride,
+      unsigned int in_col_stride,
+      float16_t* outptr,
+      unsigned int out_row_stride,
+      unsigned int out_col_stride
+    );
+};
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 }  // namespace depthwise
author	Georgios Pinitas <georgios.pinitas@arm.com>	2019-03-11 14:03:23 +0000
committer	Georgios Pinitas <georgios.pinitas@arm.com>	2019-03-29 09:54:53 +0000
commit	47d39dc615d1dee2482bc84699802165a9778ac8 (patch)
tree	87f2fdb4f4957be7ff1c043be6328e4154cdf9e1 /arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
parent	2d2551ed3934f071eb6a65f5b776301454bc147a (diff)
download	ComputeLibrary-47d39dc615d1dee2482bc84699802165a9778ac8.tar.gz