diff options
Diffstat (limited to 'arm_compute/core/NEON/kernels/winograd')
5 files changed, 54 insertions, 71 deletions
diff --git a/arm_compute/core/NEON/kernels/winograd/direct_convolution.hpp b/arm_compute/core/NEON/kernels/winograd/direct_convolution.hpp index 725f6cab65..6a9984a24a 100644 --- a/arm_compute/core/NEON/kernels/winograd/direct_convolution.hpp +++ b/arm_compute/core/NEON/kernels/winograd/direct_convolution.hpp @@ -29,6 +29,7 @@ void direct_convolution( const Tensor4D<Tensor4DShape, float>& input, const Tensor4D<KernelShape, float>& kernel, + const Tensor4D<Tensor4DShape, float>& biases, Tensor4D<Tensor4DShape, float>& output, const PaddingType padding ); diff --git a/arm_compute/core/NEON/kernels/winograd/transforms/input.hpp b/arm_compute/core/NEON/kernels/winograd/transforms/input.hpp index 39b444184e..075765a513 100644 --- a/arm_compute/core/NEON/kernels/winograd/transforms/input.hpp +++ b/arm_compute/core/NEON/kernels/winograd/transforms/input.hpp @@ -71,7 +71,7 @@ namespace winograd const int row_offset = (tile_i == 0) ? 0 : ((padding_type == PADDING_VALID) ? 0 : 1); const T* const input_base_row = ( - input_base_batch + ((inner_tile_rows - 2)*tile_i - row_offset)*input_row_stride + input_base_batch + ((inner_tile_rows - (kernel_rows - 1))*tile_i - row_offset)*input_row_stride ); T* const outptr_base_row = outptr_base_batch + tile_i*output_row_stride; diff --git a/arm_compute/core/NEON/kernels/winograd/transforms/output.hpp b/arm_compute/core/NEON/kernels/winograd/transforms/output.hpp index 7fa5ee9617..0dd719751b 100644 --- a/arm_compute/core/NEON/kernels/winograd/transforms/output.hpp +++ b/arm_compute/core/NEON/kernels/winograd/transforms/output.hpp @@ -35,6 +35,7 @@ namespace winograd const T* const matrix_base, const int matrix_stride, const int matrix_row_stride, + const T* const biases, T* const output ) { @@ -69,8 +70,9 @@ namespace winograd // Process the row process_tile_row( tile_N, output_shape.n_channels, matrix_tile_row, matrix_stride, - matrix_row_stride, outptr_row, output_row_stride, - output_col_stride, row_pad_bottom, pad_right + 
matrix_row_stride, biases, + outptr_row, output_row_stride, output_col_stride, row_pad_bottom, + pad_right ); } } @@ -85,6 +87,7 @@ namespace winograd const T* const matrix_base, const int matrix_stride, const int matrix_row_stride, + const T* const biases, T* const output, const int output_row_stride, const int output_col_stride, @@ -102,7 +105,7 @@ namespace winograd // Perform the output transformation tile_fns[row_pad_bottom][tile_pad_right]( - n_channels, matrix_row, matrix_stride, + n_channels, matrix_row, matrix_stride, biases, outptr, output_row_stride, output_col_stride ); } @@ -131,14 +134,17 @@ namespace winograd const T* const matrix_base, const int matrix_stride, const int matrix_row_stride, + const T* const biases, T* const output, const int n_batches, const int n_rows, const int n_cols, const int n_channels - ) : _matrix_base(matrix_base), _matrix_stride(matrix_stride), _matrix_row_stride(matrix_row_stride), - _outptr(output), _n_batches(n_batches), _n_rows(n_rows), _n_cols(n_cols), _n_channels(n_channels), - _tile_M(iceildiv(n_rows, output_tile_rows)), _tile_N(iceildiv(n_cols, output_tile_cols)) + ) : _matrix_base(matrix_base), _biases(biases), + _matrix_stride(matrix_stride), _matrix_row_stride(matrix_row_stride), + _outptr(output), _n_batches(n_batches), _n_rows(n_rows), _n_cols(n_cols), + _n_channels(n_channels), _tile_M(iceildiv(n_rows, output_tile_rows)), + _tile_N(iceildiv(n_cols, output_tile_cols)) { } @@ -168,7 +174,8 @@ namespace winograd _n_batches, _n_rows, _n_cols, _n_channels, NHWC }; execute( - output_shape, _matrix_base, _matrix_stride, _matrix_row_stride, _outptr + output_shape, _matrix_base, _matrix_stride, _matrix_row_stride, _biases, + _outptr ); } } // namespace winograd diff --git a/arm_compute/core/NEON/kernels/winograd/winograd_gemm.hpp b/arm_compute/core/NEON/kernels/winograd/winograd_gemm.hpp index adca48a6d6..2ea70f182b 100644 --- a/arm_compute/core/NEON/kernels/winograd/winograd_gemm.hpp +++ 
b/arm_compute/core/NEON/kernels/winograd/winograd_gemm.hpp @@ -183,7 +183,7 @@ class WinogradGEMM const int row_pad_top, const int row_pad_left, const int row_pad_bottom, - const int row_pad_right + const int n_cols ); static constexpr int max_pad_bottom = inner_tile_rows - 1; @@ -225,6 +225,7 @@ class WinogradGEMM const T* const matrix_base, const int matrix_stride, const int matrix_row_stride, + const T* const biases, T* const output ); @@ -236,6 +237,7 @@ class WinogradGEMM const T* const matrix_base, /** Pointer to base of matrices. */ const int matrix_stride, /** Stride between matrices. */ const int matrix_row_stride, /** Stride within a matrix. */ + const T* const biases, /** Pointer to biases vector. */ T* const output, /** Pointer to output tensor. */ const int n_batches, /** Number of batches in output tensor. */ const int n_rows, /** Number of rows in output tensor. */ @@ -257,6 +259,7 @@ class WinogradGEMM const T* const matrix_base, const int matrix_stride, const int matrix_row_stride, + const T* const biases, T* const output, const int output_row_stride, const int output_col_stride, @@ -270,14 +273,15 @@ class WinogradGEMM /** Prepare a single tile of the output tensor. */ template <int pad_bottom, int pad_right> - static void process_tile(int, const T*, int, T*, int, int); + static void process_tile(int, const T*, int, const T*, T*, int, int); // Array of methods to produce tiles of output tensor. - typedef void (*TileFn)(int, const T*, int, T*, int, int); + typedef void (*TileFn)(int, const T*, int, const T*, T*, int, int); static const TileFn tile_fns[max_pad_bottom][max_pad_right]; /** Member constants for instances of the transform. 
*/ const T* const _matrix_base; + const T* const _biases; const int _matrix_stride, _matrix_row_stride; T* const _outptr; const int _n_batches, _n_rows, _n_cols, _n_channels, _tile_M, _tile_N; @@ -328,6 +332,7 @@ class WinogradGEMM void execute( TOut* const output, const TIn* const input, + const TOut* const biases, void* working_space=NULL, const int n_threads=1 ); @@ -336,6 +341,7 @@ class WinogradGEMM void execute( TOut* const output, const TIn* const input, + const TOut* const biases, const int n_threads ); diff --git a/arm_compute/core/NEON/kernels/winograd/winograd_layer.hpp b/arm_compute/core/NEON/kernels/winograd/winograd_layer.hpp index 4559312df4..1db63d750b 100644 --- a/arm_compute/core/NEON/kernels/winograd/winograd_layer.hpp +++ b/arm_compute/core/NEON/kernels/winograd/winograd_layer.hpp @@ -74,87 +74,56 @@ class WinogradConvolutionLayer /** Determine how much memory (in units of TIn) to allocate for the * transformed weights. - * - * @param[in] n_output_channels Number of output feature maps. - * @param[in] n_input_channels Number of input feature maps. */ static unsigned int get_weight_storage_size( - const int n_output_channels, - const int n_input_channels + const int n_output_channels, /** Number of output feature maps. */ + const int n_input_channels /** Number of input feature maps. */ ); /** Determine how much memory (in units of TIn) to allocate for the * transformed input. - * - * @param[in] n_batches Number of batches in the input tensor. - * @param[in] n_channels Number of feature maps in the input tensor. - * @param[in] n_rows Number of rows in each feature map. - * @param[in] n_cols Number of columns in each feature map. - * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". */ static unsigned int get_input_storage_size( - const int n_batches, - const int n_channels, - const int n_rows, - const int n_cols, - const bool same_padding + const int n_batches, /** Number of batches in the input tensor. 
*/ + const int n_channels, /** Number of feature maps in the input tensor. */ + const int n_rows, /** Number of rows in each feature map. */ + const int n_cols, /** Number of columns in each feature map. */ + const bool same_padding /** Use "SAME" padding, otherwise use "VALID". */ ); /** Determine how much memory (in units of TOut) to allocate for the * (Winograd domain) output. - * - * @param[in] n_batches Number of batches in the output tensor. - * @param[in] n_rows Number of rows in each feature map of the input tensor. - * @param[in] n_cols Number of columns in each feature map of the input tensor. - * @param[in] n_output_channels Number of feature maps in the output tensor. - * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". */ static unsigned int get_output_storage_size( - const int n_batches, - const int n_rows, - const int n_cols, - const int n_output_channels, - const bool same_padding + const int n_batches, /** Number of batches in the output tensor. */ + const int n_rows, /** Number of rows in each feature map of the input tensor. */ + const int n_cols, /** Number of columns in each feature map of the input tensor. */ + const int n_output_channels, /** Number of feature maps in the output tensor. */ + const bool same_padding /** Use "SAME" padding, otherwise use "VALID". */ ); - /** Get the shape (rows, cols) of a feature map of the output tensor. - * - * @param[in] n_input_rows Number of rows in the input feature map. - * @param[in] n_input_cols Number of columns in the input feature map. - * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". - */ + /** Get the shape (rows, cols) of a feature map of the output tensor. */ static std::pair<int, int> get_output_feature_map_shape( - const int n_input_rows, - const int n_input_cols, - const bool same_padding + const int n_input_rows, /** Number of rows in the input feature map. */ + const int n_input_cols, /** Number of columns in the input feature map. 
*/ + const bool same_padding /** Use "SAME" padding, otherwise use "VALID". */ ); /** Create a new Winograd convolution layer. - * @param[in] n_batches Number of batches in the input and output tensors. - * @param[in] n_input_channels Number of feature maps in a batch of the input tensor. - * @param[in] n_input_rows Number of rows in a feature map of the input tensor. - * @param[in] n_input_cols Number of columns in a feature map of the input tensor. - * @param[in] n_output_channels Number of feature maps in the output tensor. - * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". - * @param[in] weights Pointer to weight tensor in spatial domain. Must be ordered as "Height x Rows x Input Feature Maps x Output Feature Maps. - * @param[out] weights_storage Pointer to storage for weight tensor in the Winograd domain. Must be at least the size returned by `get_weight_storage_size - * @param[in] input Pointer to NHWC ordered input tensor, in the spatial domain. - * @param[out] winograd_input Pointer to working space for the input tensor in the Winograd domain. Must be at least the size returned by `get_input_storage_size`. - * @param[out] output Pointer to NHWC ordered output tensor, in the spatial domain. - * @param[out] winograd_output Pointer to working space for the output tensor in the Winograd domain. Must be at least the size returned by `get_output_storage_size`. */ WinogradConvolutionLayer( - const int n_batches, - const int n_input_channels, - const int n_input_rows, - const int n_input_cols, - const int n_output_channels, - const bool same_padding, - const TIn* const weights, - TIn* const weights_storage, - const TIn* const input, - TIn* const winograd_input, - TOut* const output, - TOut* const winograd_output + const int n_batches, /** Number of batches in the input and output tensors. */ + const int n_input_channels, /** Number of feature maps in a batch of the input tensor. 
*/ + const int n_input_rows, /** Number of rows in a feature map of the input tensor. */ + const int n_input_cols, /** Number of columns in a feature map of the input tensor. */ + const int n_output_channels, /** Number of feature maps in the output tensor. */ + const bool same_padding, /** Use "SAME" padding, otherwise use "VALID". */ + const TIn* const weights, /** Pointer to weight tensor in spatial domain. Must be ordered as "Height x Rows x Input Feature Maps x Output Feature Maps". */ + TIn* const weights_storage, /** Pointer to storage for weight tensor in the Winograd domain. Must be at least the size returned by `get_weight_storage_size`. */ + const TIn* const input, /** Pointer to NHWC ordered input tensor, in the spatial domain. */ + TIn* const winograd_input, /** Pointer to working space for the input tensor in the Winograd domain. Must be at least the size returned by `get_input_storage_size`. */ + const TOut* const biases, /** Pointer to biases vector. */ + TOut* const output, /** Pointer to NHWC ordered output tensor, in the spatial domain. */ + TOut* const winograd_output /** Pointer to working space for the output tensor in the Winograd domain. Must be at least the size returned by `get_output_storage_size`. */ + ); }; |