From 7df27869aff38b07b50e4fe589f6b2cf51954a92 Mon Sep 17 00:00:00 2001
From: Pablo Tello <pablo.tello@arm.com>
Date: Wed, 30 May 2018 11:44:26 +0100
Subject: COMPMID-1162: Enable NHWC data layout support for
 NEWinogradConvolutionLayer - part1

In this first part we rework the configuration of the kernels: previously the
raw pointer to the buffer was passed when configuring the function, whereas the
input and output transform kernels now take an ITensor pointer instead.

Change-Id: I83d3cb64c562303093c7f0ae52395ecd080a5d52
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/133560
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
---
 .../kernels/NEWinogradConvolutionLayerKernel.h     | 198 ++++++++++++---------
 1 file changed, 117 insertions(+), 81 deletions(-)

(limited to 'arm_compute/core/NEON')
diff --git a/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
index 6b8866cb2e..68c133ee37 100644
--- a/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
@@ -42,15 +42,15 @@ public:
     /** Determine how much memory (in units of TIn) to allocate for the
      * transformed input.
      *
-     * @param[in] n_batches    Number of batches in the input tensor.
-     * @param[in] n_channels   Number of feature maps in the input tensor.
-     * @param[in] n_rows       Number of rows in each feature map.
-     * @param[in] n_cols       Number of columns in each feature map.
+     * @param[in] num_batches  Number of batches in the input tensor.
+     * @param[in] num_channels Number of feature maps in the input tensor.
+     * @param[in] num_rows     Number of rows in each feature map.
+     * @param[in] num_cols     Number of columns in each feature map.
      * @param[in] same_padding Use "SAME" padding, otherwise use "VALID".
      *
      * @return Storage size (in units of TIn) required.
      */
-    virtual unsigned int get_input_storage_size(int n_batches, int n_channels, int n_rows, int n_cols, bool same_padding) const = 0;
+    virtual unsigned int get_input_storage_size(int num_batches, int num_channels, int num_rows, int num_cols, bool same_padding) const = 0;
 
     /** Gets the stride between matrices in the input worspace
      *
@@ -64,16 +64,17 @@ public:
 
     /** Configure the output transform kernel.
      *
-     * @param[in]  input         Input tensor data
-     * @param[in]  n_batches     Number of batches in input tensor.
-     * @param[in]  n_rows        Number of rows in input tensor.
-     * @param[in]  n_cols        Number of columns in input tensor.
-     * @param[in]  n_channels    Number of channels in input tensor.
+     * @param[in]  input_nhwc    Input tensor in NHWC data layout format.
+     * @param[in]  num_batches   Number of batches in input tensor.
+     * @param[in]  num_rows      Number of rows in input tensor.
+     * @param[in]  num_cols      Number of columns in input tensor.
+     * @param[in]  num_channels  Number of channels in input tensor.
      * @param[in]  padding       Padding type.
      * @param[out] output        Base of output matrices.
      * @param[in]  matrix_stride Stride between output matrices.
      */
-    virtual void configure(const T *const input, const int n_batches, const int n_rows, const int n_cols, const int n_channels, const PaddingType padding, T *const output, const int matrix_stride) = 0;
+    virtual void configure(const ITensor *input_nhwc, const int num_batches, const int num_rows, const int num_cols, const int num_channels,
+                           const PaddingType padding, T *const output, const int matrix_stride) = 0;
 
     /** Destructor */
     virtual ~INEWinogradLayerTransformInputKernel()
@@ -86,22 +87,33 @@ template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, in
 class NEWinogradLayerTransformInputKernel : public INEWinogradLayerTransformInputKernel<T>
 {
 public:
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEWinogradLayerTransformInputKernel(const NEWinogradLayerTransformInputKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEWinogradLayerTransformInputKernel &operator=(const NEWinogradLayerTransformInputKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEWinogradLayerTransformInputKernel(NEWinogradLayerTransformInputKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEWinogradLayerTransformInputKernel &operator=(NEWinogradLayerTransformInputKernel &&) = default;
+    /** Default destructor */
+    ~NEWinogradLayerTransformInputKernel() = default;
+
     /** Determine how much memory (in units of TIn) to allocate for the
      * transformed input.
      *
-     * @param[in] n_batches    Number of batches in the input tensor.
-     * @param[in] n_channels   Number of feature maps in the input tensor.
-     * @param[in] n_rows       Number of rows in each feature map.
-     * @param[in] n_cols       Number of columns in each feature map.
+     * @param[in] num_batches  Number of batches in the input tensor.
+     * @param[in] num_channels Number of feature maps in the input tensor.
+     * @param[in] num_rows     Number of rows in each feature map.
+     * @param[in] num_cols     Number of columns in each feature map.
      * @param[in] same_padding Use "SAME" padding, otherwise use "VALID".
      *
      * @return Storage size (in units of TIn) required.
      */
     unsigned int get_input_storage_size(
-        int  n_batches,
-        int  n_channels,
-        int  n_rows,
-        int  n_cols,
+        int  num_batches,
+        int  num_channels,
+        int  num_rows,
+        int  num_cols,
         bool same_padding) const override;
 
     /** Gets the stride between matrices in the input worspace
@@ -124,21 +136,21 @@ public:
 
     /** Configure the output transform kernel.
      *
-     * @param[in]  input         Input tensor data. Data types supported: F32.
-     * @param[in]  n_batches     Number of batches in input tensor.
-     * @param[in]  n_rows        Number of rows in input tensor.
-     * @param[in]  n_cols        Number of columns in input tensor.
-     * @param[in]  n_channels    Number of channels in input tensor.
+     * @param[in]  input_nhwc    Input tensor. Data types supported: F32. Data layout supported: NHWC.
+     * @param[in]  num_batches   Number of batches in input tensor.
+     * @param[in]  num_rows      Number of rows in input tensor.
+     * @param[in]  num_cols      Number of columns in input tensor.
+     * @param[in]  num_channels  Number of channels in input tensor.
      * @param[in]  padding       Padding type.
      * @param[out] output        Base of output matrices.
      * @param[in]  matrix_stride Stride between output matrices.
      */
     void configure(
-        const T *const    input,
-        const int         n_batches,
-        const int         n_rows,
-        const int         n_cols,
-        const int         n_channels,
+        const ITensor    *input_nhwc,
+        const int         num_batches,
+        const int         num_rows,
+        const int         num_cols,
+        const int         num_channels,
         const PaddingType padding,
         T *const          output,
         const int         matrix_stride) override;
@@ -163,7 +175,14 @@ public:
 
 private:
     using InputTransform = typename WinogradBase::template InputTransform<T>;
-    std::unique_ptr<InputTransform> _transform;
+    const ITensor *_input_nhwc;
+    int            _num_batches;   /**< Number of batches in input tensor. */
+    int            _num_rows;      /**< Number of rows in input tensor. */
+    int            _num_cols;      /**< Number of columns in input tensor. */
+    int            _num_channels;  /**< Number of channels in input tensor. */
+    PaddingType    _padding;       /**< Padding type. */
+    T             *_output;        /**< Base of output matrices. */
+    int            _matrix_stride; /**< Stride between output matrices. */
 };
 
 /** Interface for the NEON kernel to perform Winograd output transform. */
@@ -174,15 +193,15 @@ public:
     /** Determine how much memory (in units of TOut) to allocate for the
      * (Winograd domain) output.
      *
-     * @param[in] n_batches         Number of batches in the output tensor.
-     * @param[in] n_rows            Number of rows in each feature map of the input tensor.
-     * @param[in] n_cols            Number of columns in each feature map of the input tensor.
-     * @param[in] n_output_channels Number of feature maps in the output tensor.
-     * @param[in] same_padding      Use "SAME" padding, otherwise use "VALID".
+     * @param[in] num_batches         Number of batches in the output tensor.
+     * @param[in] num_rows            Number of rows in each feature map of the input tensor.
+     * @param[in] num_cols            Number of columns in each feature map of the input tensor.
+     * @param[in] num_output_channels Number of feature maps in the output tensor.
+     * @param[in] same_padding        Use "SAME" padding, otherwise use "VALID".
      *
      * @return Storage size (in units of TOut) required.
      */
-    virtual unsigned int get_output_storage_size(int n_batches, int n_rows, int n_cols, int n_output_channels, bool same_padding) const = 0;
+    virtual unsigned int get_output_storage_size(int num_batches, int num_rows, int num_cols, int num_output_channels, bool same_padding) const = 0;
 
     /** Gets the stride between matrices in the output worspace
      *
@@ -209,21 +228,21 @@ public:
      * @param[in]  biases              Pointer to the biases tensor.
      * @param[in]  output_workingspace Pointer to working space for the output tensor in the Winograd domain.
      * @param[in]  matrix_stride       Output matrix stride, can be computed with winograd::WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>::get_output_matrix_stride()
-     * @param[out] output              Pointer to NHWC ordered output tensor, in the spatial domain.
-     * @param[in]  n_batches           Number of batches in the input tensor.
-     * @param[in]  n_rows              Number of rows in output tensor.
-     * @param[in]  n_cols              Number of columns in output tensor.
-     * @param[in]  n_channels          Number of feature maps in the output tensor.
+     * @param[out] output_nhwc         Pointer to the output tensor in NHWC data layout, in the spatial domain.
+     * @param[in]  num_batches         Number of batches in the input tensor.
+     * @param[in]  num_rows            Number of rows in output tensor.
+     * @param[in]  num_cols            Number of columns in output tensor.
+     * @param[in]  num_channels        Number of feature maps in the output tensor.
      */
     virtual void configure(
         const ITensor *biases,
         const T *const output_workingspace,
         const int      matrix_stride,
-        T *const       output,
-        const int      n_batches,
-        const int      n_rows,
-        const int      n_cols,
-        const int      n_channels) = 0;
+        ITensor *const output_nhwc,
+        const int      num_batches,
+        const int      num_rows,
+        const int      num_cols,
+        const int      num_channels) = 0;
 
     virtual ~INEWinogradLayerTransformOutputKernel()
     {
@@ -257,15 +276,15 @@ public:
     /** Determine how much memory (in units of TOut) to allocate for the
      * (Winograd domain) output.
      *
-     * @param[in] n_batches         Number of batches in the output tensor.
-     * @param[in] n_rows            Number of rows in each feature map of the input tensor.
-     * @param[in] n_cols            Number of columns in each feature map of the input tensor.
-     * @param[in] n_output_channels Number of feature maps in the output tensor.
-     * @param[in] same_padding      Use "SAME" padding, otherwise use "VALID".
+     * @param[in] num_batches         Number of batches in the output tensor.
+     * @param[in] num_rows            Number of rows in each feature map of the input tensor.
+     * @param[in] num_cols            Number of columns in each feature map of the input tensor.
+     * @param[in] num_output_channels Number of feature maps in the output tensor.
+     * @param[in] same_padding        Use "SAME" padding, otherwise use "VALID".
      *
      * @return Storage size (in units of TOut) required.
      */
-    unsigned int get_output_storage_size(int n_batches, int n_rows, int n_cols, int n_output_channels, bool same_padding) const override;
+    unsigned int get_output_storage_size(int num_batches, int num_rows, int num_cols, int num_output_channels, bool same_padding) const override;
 
     /** Gets the stride between matrices in the output worspace
      *
@@ -291,21 +310,21 @@ public:
      * @param[in]  biases              Pointer to the biases tensor.
      * @param[in]  output_workingspace Pointer to working space for the output tensor in the Winograd domain.
      * @param[in]  matrix_stride       Output matrix stride, can be computed with winograd::WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>::get_output_matrix_stride()
-     * @param[out] output              Pointer to NHWC ordered output tensor, in the spatial domain.
-     * @param[in]  n_batches           Number of batches in the input tensor.
-     * @param[in]  n_rows              Number of rows in output tensor.
-     * @param[in]  n_cols              Number of columns in output tensor.
-     * @param[in]  n_channels          Number of feature maps in the output tensor.
+     * @param[out] output_nhwc         Pointer to a tensor with NHWC data layout, in the spatial domain.
+     * @param[in]  num_batches         Number of batches in the input tensor.
+     * @param[in]  num_rows            Number of rows in output tensor.
+     * @param[in]  num_cols            Number of columns in output tensor.
+     * @param[in]  num_channels        Number of feature maps in the output tensor.
      */
     void configure(
         const ITensor *biases,
         const T *const output_workingspace,
         const int      matrix_stride,
-        T *const       output,
-        const int      n_batches,
-        const int      n_rows,
-        const int      n_cols,
-        const int      n_channels) override;
+        ITensor *const output_nhwc,
+        const int      num_batches,
+        const int      num_rows,
+        const int      num_cols,
+        const int      num_channels) override;
 
     void run(const Window &window, const ThreadInfo &info) override;
 
@@ -329,11 +348,11 @@ private:
     const T       *_output_workspace;
     int            _matrix_stride;
     int            _matrix_row_stride;
-    T             *_output;
-    int            _n_batches;
-    int            _n_rows;
-    int            _n_cols;
-    int            _n_channels;
+    ITensor       *_output_nhwc;
+    int            _num_batches;
+    int            _num_rows;
+    int            _num_cols;
+    int            _num_channels;
 };
 
 /** Interface for the NEON kernel to perform Winograd weights transform. */
@@ -344,12 +363,12 @@ public:
     /** Determine how much memory (in units of T) to allocate for the
      * transformed weights.
      *
-     * @param[in] n_output_channels Number of output feature maps.
-     * @param[in] n_input_channels  Number of input feature maps.
+     * @param[in] num_output_channels Number of output feature maps.
+     * @param[in] num_input_channels  Number of input feature maps.
      *
      * @return Storage size (in units of T) required.
      */
-    virtual unsigned int get_weight_storage_size(int n_output_channels, int n_input_channels) const = 0;
+    virtual unsigned int get_weight_storage_size(int num_output_channels, int num_input_channels) const = 0;
     /** Gets the stride between matrices in the kernel worspace
      *
      * @param[in] kernel_shape The shape of the weights tensor.
@@ -360,13 +379,14 @@ public:
 
     /** Configure the weights transform kernel.
      *
-     * @param[in] weights_hwio      Pointer to the weights tensor
-     * @param[in] output            Pointer to working space for the output tensor in the Winograd domain.
-     * @param[in] matrix_stride     Stride across matrices in the output workspace.
-     * @param[in] n_output_channels Number of filters.
-     * @param[in] n_input_channels  Number of channels in each filter.
+     * @param[in] weights_hwio        Pointer to the weights tensor
+     * @param[in] output              Pointer to working space for the output tensor in the Winograd domain.
+     * @param[in] matrix_stride       Stride across matrices in the output workspace.
+     * @param[in] num_output_channels Number of filters.
+     * @param[in] num_input_channels  Number of channels in each filter.
      */
-    virtual void configure(const ITensor *weights_hwio, T *const output, const int matrix_stride, const int n_output_channels, const int n_input_channels) = 0;
+
+    virtual void configure(const ITensor *weights_hwio, T *const output, const int matrix_stride, const int num_output_channels, const int num_input_channels) = 0;
 
     virtual ~INEWinogradLayerTransformWeightsKernel()
     {
@@ -378,6 +398,17 @@ template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, in
 class NEWinogradLayerTransformWeightsKernel final : public INEWinogradLayerTransformWeightsKernel<T>
 {
 public:
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEWinogradLayerTransformWeightsKernel(const NEWinogradLayerTransformWeightsKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEWinogradLayerTransformWeightsKernel &operator=(const NEWinogradLayerTransformWeightsKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEWinogradLayerTransformWeightsKernel(NEWinogradLayerTransformWeightsKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEWinogradLayerTransformWeightsKernel &operator=(NEWinogradLayerTransformWeightsKernel &&) = default;
+    /** Default destructor */
+    ~NEWinogradLayerTransformWeightsKernel() = default;
+
     /** Default constructor. */
     NEWinogradLayerTransformWeightsKernel();
     const char *name() const override
@@ -397,8 +428,8 @@ public:
     static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info);
 
     // Inherited methods overridden:
-    void configure(const ITensor *weights_hwio, T *const output, const int matrix_stride, const int n_output_channels, const int n_input_channels) override;
-    unsigned int get_weight_storage_size(int n_output_channels, int n_input_channels) const override;
+    void configure(const ITensor *weights_hwio, T *const output, const int matrix_stride, const int num_output_channels, const int num_input_channels) override;
+    unsigned int get_weight_storage_size(int num_output_channels, int num_input_channels) const override;
     int get_matrix_stride(const KernelShape &kernel_shape) const override;
     void run(const Window &window, const ThreadInfo &info) override;
     bool is_parallelisable() const override;
@@ -407,7 +438,12 @@ private:
     using WinogradBase     = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols>;
     using WinogradConv     = typename WinogradBase::template Convolution<T, T>;
     using WeightsTransform = typename WinogradBase::template WeightsTransform<T>;
-    std::unique_ptr<WeightsTransform> _transform;
+
+    const ITensor *_weights_hwio;
+    T             *_output;
+    int            _matrix_stride;
+    int            _num_output_channels;
+    int            _num_input_channels;
 };
 
 /** Interface for the NEON kernel to perform Winograd. */
@@ -421,7 +457,7 @@ public:
     /** Initialise the kernel
      *
      * @param[in]  n_gemms         Number of GEMMs to compute.
-     * @param[in]  M               in_shape.n_batches * tile_rows * tile_cols.
+     * @param[in]  M               in_shape.num_batches * tile_rows * tile_cols.
      * @param[in]  K               Number of channels in the input tensor.
      * @param[in]  N               Number of channels in the output tensor.
      * @param[in]  a_matrix_stride Stride between input matrices.
@@ -498,7 +534,7 @@ public:
     /** Initialise the kernel
      *
      * @param[in]  n_gemms         Number of GEMMs to compute.
-     * @param[in]  M               in_shape.n_batches * tile_rows * tile_cols.
+     * @param[in]  M               in_shape.num_batches * tile_rows * tile_cols.
      * @param[in]  K               Number of channels in the input tensor.
      * @param[in]  N               Number of channels in the output tensor.
      * @param[in]  a_matrix_stride Stride between input matrices.
-- 
cgit v1.2.1