aboutsummaryrefslogtreecommitdiff
path: root/arm_compute/core/NEON/kernels
diff options
context:
space:
mode:
authorPablo Tello <pablo.tello@arm.com>2018-08-22 11:40:33 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:54:54 +0000
commitbda6e4b51bc4045c97100bb9d562164ba7c6c28f (patch)
tree8924bbae251b34dc35a4ffc9a9ece79d28c4415b /arm_compute/core/NEON/kernels
parent238c97cd8bfdb6dfce5c4eefed6aac4d9bb59457 (diff)
downloadComputeLibrary-bda6e4b51bc4045c97100bb9d562164ba7c6c28f.tar.gz
COMPMID-1247: Integrate kernel size 1x3 & 3x1 support in NEWinogradLayer.
Change-Id: I6fe198881230e49864c841a3b2366ccf2a9247f9 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/145210 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Diffstat (limited to 'arm_compute/core/NEON/kernels')
-rw-r--r--arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h28
-rw-r--r--arm_compute/core/NEON/kernels/convolution/common/utils.hpp2
-rw-r--r--arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp22
-rw-r--r--arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp22
-rw-r--r--arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp24
5 files changed, 88 insertions, 10 deletions
diff --git a/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
index 9cdd69a70a..c71c105d92 100644
--- a/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
@@ -159,7 +159,7 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
/** Winograd base kernel */
- using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelCols, KernelCols>;
+ using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols>;
/** Winograd convolution kernel */
using WinogradConv = typename WinogradBase::template Convolution<T, T>;
@@ -360,6 +360,21 @@ template <typename T>
class INEWinogradLayerTransformWeightsKernel : public INEKernel
{
public:
+ /** Allow instances of this class to be copied */
+ INEWinogradLayerTransformWeightsKernel(const INEWinogradLayerTransformWeightsKernel &) = default;
+ /** Allow instances of this class to be copied */
+ INEWinogradLayerTransformWeightsKernel &operator=(const INEWinogradLayerTransformWeightsKernel &) = default;
+ /** Allow instances of this class to be moved */
+ INEWinogradLayerTransformWeightsKernel(INEWinogradLayerTransformWeightsKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ INEWinogradLayerTransformWeightsKernel &operator=(INEWinogradLayerTransformWeightsKernel &&) = default;
+
+ INEWinogradLayerTransformWeightsKernel()
+ {
+ }
+ virtual ~INEWinogradLayerTransformWeightsKernel()
+ {
+ }
/** Determine how much memory (in units of T) to allocate for the
* transformed weights.
*
@@ -388,9 +403,14 @@ public:
virtual void configure(const ITensor *weights_hwio, ITensor *output, const int matrix_stride, const int num_output_channels, const int num_input_channels) = 0;
- virtual ~INEWinogradLayerTransformWeightsKernel()
- {
- }
+ /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradLayerTransformWeightsKernel
+ *
+ * @param[in] input First tensor input info. Data types supported: F32.
+ * @param[in] weights Weights tensor info. Data types supported: same as @p input.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights);
};
/** NEON kernel to perform Winograd weights transform. */
diff --git a/arm_compute/core/NEON/kernels/convolution/common/utils.hpp b/arm_compute/core/NEON/kernels/convolution/common/utils.hpp
index 5f42719119..25bfa332fb 100644
--- a/arm_compute/core/NEON/kernels/convolution/common/utils.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/common/utils.hpp
@@ -26,7 +26,7 @@
void PrintMatrix(const float *const m, const int M, const int N, const int row_stride);
-inline int iceildiv(const int a, const int b)
+constexpr inline int iceildiv(const int a, const int b)
{
return (a + b - 1) / b;
}
diff --git a/arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp b/arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp
index 13218030d2..369c2ff48f 100644
--- a/arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp
@@ -50,6 +50,22 @@ namespace winograd
const int matrix_row_stride /** Stride within matrices. */
)
{
+ // If an Nx1 kernel then transpose and redirect to the 1xN implementation
+ if (kernel_cols == 1)
+ {
+ WinogradGEMM<output_tile_cols, output_tile_rows, kernel_cols, kernel_rows>::
+ template InputTransform<T>::execute(
+ input,
+ n_batches, in_batch_stride,
+ n_cols, in_col_stride,
+ n_rows, in_row_stride,
+ n_channels, padding,
+ tile_N, tile_M,
+ output, matrix_stride, matrix_batch_stride, matrix_row_stride
+ );
+ return;
+ }
+
// Compute the padding required on each edge of the image
const int pad_top = (padding == PADDING_SAME) ? (kernel_rows - 1) / 2 : 0;
const int pad_left = (padding == PADDING_SAME) ? (kernel_cols - 1) / 2 : 0;
@@ -111,6 +127,12 @@ namespace winograd
const int n_cols
)
{
+ if (kernel_cols == 1)
+ {
+ // If an Nx1 implementation then this should never be reached.
+ return;
+ }
+
constexpr int tile_overlap = kernel_cols - 1;
// Loop over columns of tiles
diff --git a/arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp b/arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp
index 700ca76c68..6ed146bf85 100644
--- a/arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp
@@ -45,6 +45,22 @@ namespace winograd
T* const output
)
{
+ // If an Nx1 kernel then transpose and redirect to the 1xN implementation.
+ if (kernel_cols == 1)
+ {
+ WinogradGEMM<output_tile_cols, output_tile_rows, kernel_cols, kernel_rows>::
+ template OutputTransform<T>::execute(
+ n_batches,
+ output_batch_stride,
+ n_cols, output_col_stride,
+ n_rows, output_row_stride,
+ n_channels,
+ matrix_base, matrix_stride, matrix_row_stride,
+ biases, output
+ );
+ return;
+ }
+
// Compute the number of tiles and hence the padding required on the bottom
// and right of the image.
const int tile_M = iceildiv(n_rows, output_tile_rows);
@@ -98,6 +114,12 @@ namespace winograd
const int row_pad_right
)
{
+ if (kernel_cols == 1)
+ {
+ // If an Nx1 implementation then this should never be reached.
+ return;
+ }
+
// Loop over columns of tiles
for (int tile_j = 0; tile_j < tile_N; tile_j++)
{
diff --git a/arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp b/arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp
index bc067fd07a..7098fc48a1 100644
--- a/arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp
@@ -49,8 +49,8 @@ class WinogradGEMM
static constexpr int output_tile_cols = OutputTileCols;
static constexpr int kernel_rows = KernelRows;
static constexpr int kernel_cols = KernelCols;
- static constexpr int inner_tile_rows = output_tile_rows + kernel_rows - 1; // TODO Check
- static constexpr int inner_tile_cols = output_tile_cols + kernel_cols - 1; // TODO Check
+ static constexpr int inner_tile_rows = output_tile_rows + kernel_rows - 1;
+ static constexpr int inner_tile_cols = output_tile_cols + kernel_cols - 1;
static constexpr int N_GEMMS = inner_tile_rows * inner_tile_cols;
/** Transform weights from the spatial to the Winograd domain. */
@@ -196,8 +196,21 @@ class WinogradGEMM
const int n_cols
);
- static constexpr int max_pad_bottom = inner_tile_rows - 1;
- static constexpr int max_pad_right = inner_tile_cols - 1;
+ // Tile overlaps
+ static constexpr int overlap_rows = kernel_rows - 1;
+ static constexpr int overlap_cols = kernel_cols - 1;
+
+ // Maximum padding and number of distinct paddings
+ static constexpr int max_pad_top = kernel_rows / 2;
+ static constexpr int n_pad_top = 1 + iceildiv(max_pad_top, inner_tile_rows - overlap_rows);
+
+ static constexpr int max_pad_left = kernel_cols / 2;
+ static constexpr int n_pad_left = 1 + iceildiv(max_pad_left, inner_tile_cols - overlap_cols);
+
+ static constexpr int n_pad_bottom = inner_tile_rows;
+ static constexpr int n_pad_right = inner_tile_cols;
+
+
/** Process a single tile of the input tensor. */
template <int pad_top, int pad_left, int pad_bottom, int pad_right>
@@ -205,7 +218,8 @@ class WinogradGEMM
// Array of methods to transform tiles of the input tensor.
typedef void (*TileFn)(int, const T*, int, int, T*, int);
- static const TileFn tile_fns[2][2][max_pad_bottom][max_pad_right];
+ static const TileFn
+ tile_fns[n_pad_top][n_pad_left][n_pad_bottom][n_pad_right];
/* Member values for instance-based API. */
const T* const _inptr;