From ed5a492ba791d8c8b3334749d4ae946b8f11d13d Mon Sep 17 00:00:00 2001
From: Michele Di Giorgio <michele.digiorgio@arm.com>
Date: Thu, 13 Sep 2018 16:22:01 +0100
Subject: COMPMID-1586: Add support for NHWC CLDeconvolutionLayer

COMPMID-1651: Fix QASYMM8 CLDeconvolutionLayer

This patch also extends the range of values used for testing Convolution and
Deconvolution to cover quantized [-1.0f, 1.0f].

Change-Id: I8b280669db67bb3ec25bf5d411c8f5954f5b0dab
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/149869
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
Tested-by: bsgcomp <bsgcomp@arm.com>
---
 .../core/CPP/kernels/CPPFlipWeightsKernel.h        |  8 ++---
 arm_compute/core/Utils.h                           | 10 ------
 arm_compute/core/utils/misc/ShapeCalculator.h      | 39 +++++++++++++++++-----
 .../runtime/CL/functions/CLDeconvolutionLayer.h    | 17 ++++++++--
 .../runtime/NEON/functions/NEDeconvolutionLayer.h  | 17 +++++++---
 5 files changed, 60 insertions(+), 31 deletions(-)

(limited to 'arm_compute')
diff --git a/arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h b/arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h
index 801934159d..04567ed959 100644
--- a/arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h
@@ -53,7 +53,7 @@ public:
 
     /** Set the input and output of the kernel.
      *
-     * @param[in]  input  The input tensor to flip. Data types supported: QASYMM8/F16/F32
+     * @param[in]  input  The input tensor to flip. Data types supported: QASYMM8/F16/F32. Data layouts supported: NCHW/NHWC.
      * @param[out] output The output tensor. Data types supported: Same as @p input
      */
     void configure(const ITensor *input, ITensor *output);
@@ -64,17 +64,15 @@ public:
     /** Function to perform flipping.
      *
      * @param[in] window_input Input region on which to execute the kernel.
-     * @param[in] window       Output region on which to execute the kernel.
      */
     template <typename T>
-    void flip_weights(const Window &window_input, const Window &window);
+    void flip_weights(const Window &window_input);
 
     /** Common signature for all the specialised Flip functions
      *
      * @param[in] window_input Input region on which to execute the kernel.
-     * @param[in] window       Output region on which to execute the kernel.
      */
-    using FlipWeightsFunction = void (CPPFlipWeightsKernel::*)(const Window &window_input, const Window &window);
+    using FlipWeightsFunction = void (CPPFlipWeightsKernel::*)(const Window &window_input);
 
 private:
     const ITensor      *_input;
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index 7ee24e2736..cfd273618c 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -815,16 +815,6 @@ inline DataType data_type_for_convolution_matrix(const int16_t *conv, size_t siz
  */
 PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info);
 
-/** Returns expected shape for the deconvolution output tensor.
- *
- * @param[in] out_dims widht and height of the output tensor, these values can be obtained with the function deconvolution_output_dimensions.
- * @param[in] input    Shape of the input tensor.
- * @param[in] weights  Shape of the weights tensor.
- *
- * @return Deconvolution output tensor shape.
- */
-TensorShape deconvolution_output_shape(const std::pair<unsigned int, unsigned int> &out_dims, TensorShape input, TensorShape weights);
-
 /** Returns expected width and height of the deconvolution's output tensor.
  *
  * @param[in] in_width      Width of input tensor (Number of columns)
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index f68401c1b9..11d20c919f 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -229,26 +229,49 @@ inline TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input,
     return output_shape;
 }
 
-inline TensorShape compute_deconvolution_shape(const ITensorInfo &input, const ITensorInfo &weights, unsigned int sx, unsigned int sy, unsigned int inner_border_right, unsigned int inner_border_top,
-                                               std::pair<unsigned int, unsigned int> &out_dims)
+inline TensorShape compute_deconvolution_upsampled_shape(const ITensorInfo &input, const ITensorInfo &weights, unsigned int sx, unsigned int sy, unsigned int inner_border_right,
+                                                         unsigned int inner_border_top,
+                                                         std::pair<unsigned int, unsigned int> &out_dims, unsigned int &padx, unsigned int &pady)
 {
+    const DataLayout data_layout = input.data_layout(); // width/height positions below are resolved from the input's data layout
+    const size_t     idx_w       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const size_t     idx_h       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
     // Find the upsampled dimensions
+    unsigned int out_x = (input.dimension(idx_w) - 1) * sx + inner_border_right + 1; // upsampled width, before padding is added
+    unsigned int out_y = (input.dimension(idx_h) - 1) * sy + inner_border_top + 1; // upsampled height, before padding is added
 
     // Find the padding needed for the convolution with stride 1 in order to match output shape
+    padx = out_dims.first - (out_x - weights.dimension(idx_w) + 1); // written to the caller's padx; NOTE(review): unsigned math, confirm it cannot wrap
+    pady = out_dims.second - (out_y - weights.dimension(idx_h) + 1); // written to the caller's pady; NOTE(review): unsigned math, confirm it cannot wrap
     out_x += padx;
     out_y += pady;
 
     TensorShape scale_out_shape(input.tensor_shape());
-    scale_out_shape.set(0, out_x);
-    scale_out_shape.set(1, out_y);
+    scale_out_shape.set(idx_w, out_x); // only width and height are overridden; other dimensions stay as in the input shape
+    scale_out_shape.set(idx_h, out_y);
 
     return scale_out_shape;
 }
 
+inline TensorShape compute_deconvolution_output_shape(const std::pair<unsigned int, unsigned int> &out_dims, const ITensorInfo &input, const ITensorInfo &weights)
+{
+    const TensorShape input_shape{ input.tensor_shape() };
+    const TensorShape weights_shape{ weights.tensor_shape() };
+    // Builds the output shape from the input shape; dimension indices are resolved from the input's data layout.
+    const DataLayout data_layout = input.data_layout();
+    const int        width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int        channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    const int        batch_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+    // Width and height are taken from out_dims (first = width, second = height); every other dimension starts as a copy of the input's.
+    TensorShape out_shape{ input_shape };
+    out_shape.set(width_idx, out_dims.first);
+    out_shape.set(height_idx, out_dims.second);
+    out_shape.set(channel_idx, weights_shape[batch_idx]); // NOTE(review): presumably the number of kernels / output feature maps - confirm
+    return out_shape;
+}
+
 inline TensorShape compute_im2col_conv_shape(const ITensorInfo *input, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, bool batch_size_on_z,
                                              unsigned int num_groups = 1)
 {
diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
index 6716cd6fdd..39cbe0cafa 100644
--- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
@@ -46,8 +46,12 @@ class ICLTensor;
  * specified value where a < stride - 1, that increases the padding top and right of the input image.
  *
  *  The relation between input to output is as follows:
- *      width_output = round((width_input − 1) ∗ (stride_x - 1) − 2 ∗ padding_x + kernel_x + inner_border_right )
- *      height_output = round((height_input − 1) ∗ (stride_y - 1) − 2 ∗ padding_y + kernel_y + inner_border_top )
+ *  \f[
+ *       width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x
+ *  \f]
+ *  \f[
+ *       height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y
+ *  \f]
  *
  *  where:
  *      width_input is the size of the first input dimension.
@@ -55,9 +59,16 @@ class ICLTensor;
  *      width_output is the size of the first output dimension.
  *      height_output is the size of the second output dimension.
  *      kernel_x and kernel_y are the convolution sizes in x and y.
- *      inner_border_right and inner_border_top the number of zeros added to the right and top edges of the input.
  *      stride_x and stride_y is the input stride of the first and second dimension.
  *
+ * Deconvolution expects the same weights that would be used for the corresponding Convolution. To run it as an actual convolution, those weights
+ * have to be applied in reverse order, which is achieved by flipping them with the @ref CPPFlipWeightsKernel.
+ *
+ * This function calls the following OpenCL kernels/functions:
+ *
+ * -# @ref CLDeconvolutionLayerUpsample
+ * -# @ref CLConvolutionLayer
+ *
  */
 class CLDeconvolutionLayer : public IFunction
 {
diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
index 0cca555621..73870093b7 100644
--- a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
@@ -46,8 +46,12 @@ namespace arm_compute
  * specified value where a < stride - 1 that increases the padding top and right of the input image.
  *
  *  The relation between input to output is as follows:
- *       width_output = round((width_input − 1) ∗ (stride_x - 1) − 2 ∗ padding_x + kernel_x + inner_border_right )
- *       height_output = round((height_input − 1) ∗ (stride_y - 1) − 2 ∗ padding_y + kernel_y + inner_border_top )
+ *  \f[
+ *       width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x
+ *  \f]
+ *  \f[
+ *       height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y
+ *  \f]
  *
  *  where
  *      width is the size of the first input dimension.
@@ -55,12 +59,15 @@ namespace arm_compute
  *      width_output is the size of the first output dimension.
  *      height_output is the size of the second output dimension.
  *      kernel_x and kernel_y are the convolution sizes in x and y.
- *      inner_border_right and inner_border_top the number of zeros added to the top and right edges of the input.
  *      stride_x and stride_y is the input stride of the first and second dimension.
  *
- *  This function calls the following NEON kernels:
+ * Deconvolution expects the same weights that would be used for the corresponding Convolution. To run it as an actual convolution, those weights
+ * have to be applied in reverse order, which is achieved by flipping them with the @ref CPPFlipWeightsKernel.
  *
- * -# @ref NEDirectConvolutionLayer
+ * This function calls the following NEON kernels/functions:
+ *
+ * -# @ref CPPUpsample
+ * -# @ref NEConvolutionLayer
  *
  */
 class NEDeconvolutionLayer : public IFunction
-- 
cgit v1.2.1