From 5cb4c42cb5d781a44409ebc97a408e1379ce182d Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice
Date: Fri, 23 Jun 2017 10:38:25 +0100
Subject: COMPMID-414 - Port CLConvolutionLayer to support 8 bit fixed point

- CLWeightsReshapeKernel

Change-Id: Ie32e6bdd557a8243eb9988aa7eab4e4ca2291e79
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/78701
Tested-by: Kaizen
Reviewed-by: Moritz Pflanzer
---
 .../core/CL/kernels/CLWeightsReshapeKernel.h       | 58 ++-------------
 .../runtime/CL/functions/CLConvolutionLayer.h      | 14 ++--
 .../runtime/CL/functions/CLLocallyConnectedLayer.h | 18 ++---
 docs/00_introduction.dox                           |  2 +-
 src/core/CL/CLHelpers.cpp                          |  2 +
 src/core/CL/kernels/CLWeightsReshapeKernel.cpp     | 86 +++++++---------------
 src/core/NEON/kernels/NEWeightsReshapeKernel.cpp   | 23 ++----
 7 files changed, 60 insertions(+), 143 deletions(-)
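For context: QS8 is the library's 8-bit signed fixed-point format, where a per-tensor fixed_point_position selects how many of the 8 bits are fractional, so a raw value r represents r / 2^fixed_point_position. A minimal standalone sketch of that encoding (the helper names and the fixed-point position of 5 are illustrative assumptions, not library code):

    #include <cmath>
    #include <cstdint>
    #include <iostream>

    // Encode a real value as QS8 (8-bit signed fixed point).
    // fixed_point_position is the number of fractional bits.
    int8_t to_qs8(float value, int fixed_point_position)
    {
        const float scale   = static_cast<float>(1 << fixed_point_position);
        const float clamped = std::fmax(-128.0f, std::fmin(127.0f, std::round(value * scale)));
        return static_cast<int8_t>(clamped);
    }

    // Decode a QS8 raw value back to a real number.
    float from_qs8(int8_t raw, int fixed_point_position)
    {
        return static_cast<float>(raw) / static_cast<float>(1 << fixed_point_position);
    }

    int main()
    {
        const int    fp = 5;                // hypothetical: 5 fractional bits
        const int8_t q  = to_qs8(1.25f, fp); // 1.25 * 32 = 40
        std::cout << static_cast<int>(q) << " -> " << from_qs8(q, fp) << "\n"; // 40 -> 1.25
        return 0;
    }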
diff --git a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
index 1dc8a8b80e..0d00f0e00e 100644
--- a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
+++ b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
@@ -31,11 +31,8 @@ namespace arm_compute
 class CLWeightsReshapeKernel : public ICLKernel
 {
 public:
-    /** Constructor.
-     *
-     * @param[in] is_shared Flag to indicate whether the weights are shared or not.
-     */
-    CLWeightsReshapeKernel(bool is_shared = false);
+    /** Constructor. */
+    CLWeightsReshapeKernel();
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     CLWeightsReshapeKernel(const CLWeightsReshapeKernel &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -50,7 +47,7 @@ public:
     /** Set the input and output of the kernel.
      *
      * @param[in]  input  The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
-     *                    and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: F16, F32
+     *                    and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QS8/F16/F32
      * @param[in]  biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
      *                    dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
      * @param[out] output The output tensor. Should be a 2D Tensor. Data types supported: Same as @p input
@@ -58,57 +55,12 @@ public:
      */
     void configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output);
 
     // Inherited methods overridden:
-    virtual void run(const Window &window, cl::CommandQueue &queue) = 0;
+    void run(const Window &window, cl::CommandQueue &queue) override;
 
-protected:
-    bool             _is_shared;
+private:
     const ICLTensor *_input;
     const ICLTensor *_biases;
     ICLTensor       *_output;
 };
-
-/** Interface for the weights reshape kernel used by convolution and fully connected layers.
- *
- * Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels.
- * In combination with the @ref CLIm2ColKernel can transform a convolution into a matrix multiplication.
- *
- * For example assuming a 3D weight kernel of 3x3 dimensions and depth of 2 we have:
- * @f[
- * \left( \begin{array}{ccc}
- * a000 & a001 & a002 \\
- * a010 & a011 & a012 \\
- * a020 & a021 & a022 \\
- * \end{array} \right)
- * \left( \begin{array}{ccc}
- * a100 & a101 & a102 \\
- * a110 & a111 & a112 \\
- * a120 & a121 & a122 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccc}
- * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
- * \end{array} \right)
- * @f]
- */
-class CLConvolutionLayerWeightsReshapeKernel : public CLWeightsReshapeKernel
-{
-public:
-    /** Default constructor */
-    CLConvolutionLayerWeightsReshapeKernel();
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-};
-
-/** Interface for the weights reshape kernel used by locally connected layers. */
-class CLLocallyConnectedLayerWeightsReshapeKernel : public CLWeightsReshapeKernel
-{
-public:
-    /** Default constructor */
-    CLLocallyConnectedLayerWeightsReshapeKernel();
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-};
 }
 #endif /*__ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H__ */
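The reshape that both layers now share flattens each 3D kernel [kernel_x, kernel_y, IFM] into a single vector and appends that kernel's bias when one is present; per the shape arithmetic in the .cpp file below, the OpenCL kernel effectively writes one linearized kernel per output-matrix column, giving a [OFM, kernel_x * kernel_y * IFM + 1] result. A host-side sketch of the same rearrangement, transposed to one kernel per row for readability (illustrative only, not the kernel's code):

    #include <cstddef>
    #include <vector>

    // Simplified shared-weights case: weights laid out as
    // [kernel_x, kernel_y, IFM, OFM] (x fastest) become one linearized
    // kernel per row, with the bias for that OFM appended at the end.
    std::vector<std::vector<float>> reshape_weights(const std::vector<float> &w,
                                                    std::size_t kx, std::size_t ky,
                                                    std::size_t ifm, std::size_t ofm,
                                                    const std::vector<float> *biases)
    {
        std::vector<std::vector<float>> out(ofm);
        for(std::size_t m = 0; m < ofm; ++m)
        {
            for(std::size_t c = 0; c < ifm; ++c)
            {
                for(std::size_t y = 0; y < ky; ++y)
                {
                    for(std::size_t x = 0; x < kx; ++x)
                    {
                        // Linear index for layout [kx, ky, ifm, ofm]
                        out[m].push_back(w[((m * ifm + c) * ky + y) * kx + x]);
                    }
                }
            }
            if(biases != nullptr)
            {
                out[m].push_back((*biases)[m]); // bias appended as the last element
            }
        }
        return out;
    }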
diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
index 6a40396f9a..8030b40a71 100644
--- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
@@ -53,7 +53,7 @@ public:
     CLConvolutionLayerReshapeWeights();
     /** Set the input and output tensors.
      *
-     * @param[in]  weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: F32.
+     * @param[in]  weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QS8/F16/F32.
      * @param[in]  biases       Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
      * @param[out] output       Destination tensor. Data types supported: Same as @p weights.
      * @param[in]  transpose1xW True if the weights are to undergo a 1xW transposition after reshaping (in case of GEMM operation), false otherwise.
@@ -64,16 +64,16 @@ public:
     void run() override;
 
 private:
-    CLConvolutionLayerWeightsReshapeKernel _weights_reshape_kernel;
-    CLGEMMTranspose1xWKernel               _weights_transposed_kernel;
-    CLTensor                               _weights_reshaped;
-    bool                                   _transpose1xW;
+    CLWeightsReshapeKernel   _weights_reshape_kernel;
+    CLGEMMTranspose1xWKernel _weights_transposed_kernel;
+    CLTensor                 _weights_reshaped;
+    bool                     _transpose1xW;
 };
 
 /** Basic function to compute the convolution layer. This function calls the following OpenCL kernels:
  *
- * -# @ref CLConvolutionLayerWeightsReshapeKernel (executed only once for each configuration)
- * -# @ref CLGEMMTranspose1xWKernel               (executed only once for each configuration)
+ * -# @ref CLWeightsReshapeKernel   (executed only once for each configuration)
+ * -# @ref CLGEMMTranspose1xWKernel (executed only once for each configuration)
  * -# @ref CLIm2ColKernel
  * -# @ref CLGEMMInterleave4x4Kernel
  * -# @ref CLGEMMMatrixMultiplyKernel
diff --git a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
index b4e469196e..5f4f1ba1d7 100644
--- a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
@@ -39,7 +39,7 @@ class ICLTensor;
 /** Basic function to compute the locally connected layer. This function calls the following OpenCL kernels:
  *
- * -# @ref CLLocallyConnectedLayerWeightsReshapeKernel (executed only once for each configuration)
+ * -# @ref CLWeightsReshapeKernel (executed only once for each configuration)
  * -# @ref CLIm2ColKernel
  * -# @ref CLLocallyConnectedMatrixMultiplyKernel
  * -# @ref CLCol2ImKernel
@@ -66,14 +66,14 @@ public:
     void run() override;
 
 private:
-    CLIm2ColKernel                              _input_im2col_kernel;
-    CLLocallyConnectedLayerWeightsReshapeKernel _weights_reshape_kernel;
-    CLLocallyConnectedMatrixMultiplyKernel      _mm_kernel;
-    CLCol2ImKernel                              _output_col2im_kernel;
-    CLTensor                                    _input_im2col_reshaped;
-    CLTensor                                    _weights_reshaped;
-    CLTensor                                    _gemm_output;
-    bool                                        _is_first_run;
+    CLIm2ColKernel                         _input_im2col_kernel;
+    CLWeightsReshapeKernel                 _weights_reshape_kernel;
+    CLLocallyConnectedMatrixMultiplyKernel _mm_kernel;
+    CLCol2ImKernel                         _output_col2im_kernel;
+    CLTensor                               _input_im2col_reshaped;
+    CLTensor                               _weights_reshaped;
+    CLTensor                               _gemm_output;
+    bool                                   _is_first_run;
 };
 }
 #endif /* __ARM_COMPUTE_CLLOCALLYCONNECTEDLAYER_H__ */
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index 1fb94ed637..d4b4b6f10d 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -205,7 +205,7 @@ v17.02.1 Sources preview
  - New OpenCL kernels / functions:
    - @ref arm_compute::CLLogits1DMaxKernel, @ref arm_compute::CLLogits1DShiftExpSumKernel, @ref arm_compute::CLLogits1DNormKernel / @ref arm_compute::CLSoftmaxLayer
    - @ref arm_compute::CLPoolingLayerKernel / @ref arm_compute::CLPoolingLayer
-   - @ref arm_compute::CLIm2ColKernel, @ref arm_compute::CLCol2ImKernel, @ref arm_compute::CLConvolutionLayerWeightsReshapeKernel / @ref arm_compute::CLConvolutionLayer
+   - @ref arm_compute::CLIm2ColKernel, @ref arm_compute::CLCol2ImKernel, arm_compute::CLConvolutionLayerWeightsReshapeKernel / @ref arm_compute::CLConvolutionLayer
    - @ref arm_compute::CLRemapKernel / @ref arm_compute::CLRemap
    - @ref arm_compute::CLGaussianPyramidHorKernel, @ref arm_compute::CLGaussianPyramidVertKernel / @ref arm_compute::CLGaussianPyramid, @ref arm_compute::CLGaussianPyramidHalf, @ref arm_compute::CLGaussianPyramidOrb
    - @ref arm_compute::CLMinMaxKernel, @ref arm_compute::CLMinMaxLocationKernel / @ref arm_compute::CLMinMaxLocation
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 4b5bbbbb49..835260d35a 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -80,6 +80,8 @@ std::string get_cl_type_from_data_type(const DataType &dt)
             return "ushort";
         case DataType::S16:
             return "short";
+        case DataType::QS16:
+            return "qs16";
         case DataType::U32:
             return "uint";
         case DataType::S32:
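With the new case above, get_cl_type_from_data_type() can translate QS16 into the qs16 type used on the OpenCL side. A self-contained sketch of the typical call-site pattern, assuming the usual "-DDATA_TYPE=..." build-option convention (the option name and the stand-in types are assumptions for this sketch, not the library's API):

    #include <iostream>
    #include <string>

    // Stand-ins so the sketch compiles on its own; in the library these are
    // arm_compute::DataType and arm_compute::get_cl_type_from_data_type().
    enum class DataType { QS8, QS16, F16, F32 };

    std::string get_cl_type_from_data_type(DataType dt)
    {
        switch(dt)
        {
            case DataType::QS8:  return "qs8";
            case DataType::QS16: return "qs16"; // the mapping added in this patch
            case DataType::F16:  return "half";
            case DataType::F32:  return "float";
            default:             return "";
        }
    }

    int main()
    {
        // Inject the CL-side type as a kernel compile-time option.
        const std::string build_opt = "-DDATA_TYPE=" + get_cl_type_from_data_type(DataType::QS16);
        std::cout << build_opt << "\n"; // -DDATA_TYPE=qs16
        return 0;
    }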
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
index 018f272921..845bd3799d 100644
--- a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
+++ b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
@@ -34,32 +34,38 @@
 using namespace arm_compute;
 
-CLWeightsReshapeKernel::CLWeightsReshapeKernel(bool is_shared)
-    : _is_shared(is_shared), _input(nullptr), _biases(nullptr), _output(nullptr)
+CLWeightsReshapeKernel::CLWeightsReshapeKernel()
+    : _input(nullptr), _biases(nullptr), _output(nullptr)
 {
 }
 
 void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
-    if(_is_shared)
-    {
-        ARM_COMPUTE_ERROR_ON(input->info()->dimension(4) != (output->info()->dimension(2)));
-        ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 5);
-        ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 3);
-    }
-    else
-    {
-        ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 4);
-        ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 2);
-    }
-
-    // Check biases
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
-    }
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
+    ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (biases->info()->num_dimensions() != 1));
+    ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (biases->info()->num_dimensions() != 2));
+    ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (biases->info()->dimension(0) != input->info()->tensor_shape()[3]));
+    ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (biases->info()->dimension(0) != input->info()->tensor_shape()[3] || biases->info()->dimension(1) != input->info()->tensor_shape()[4]));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != input->info()->dimension(1));
+
+    const DataType dt                   = input->info()->data_type();
+    const int      fixed_point_position = input->info()->fixed_point_position();
+
+    TensorShape output_shape{ input->info()->tensor_shape() };
+    output_shape.collapse(3);
+    const size_t tmp_dim = output_shape[0];
+    output_shape.set(0, output_shape[1]);
+    output_shape.set(1, tmp_dim + (biases != nullptr ? 1 : 0));
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, dt, fixed_point_position);
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
 
     _biases = biases;
     _output = output;
@@ -88,43 +94,7 @@ void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *
     ICLKernel::configure(win);
 }
 
-CLConvolutionLayerWeightsReshapeKernel::CLConvolutionLayerWeightsReshapeKernel()
-    : CLWeightsReshapeKernel(false)
-{
-}
-
-void CLConvolutionLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-    Window out_window;
-    out_window.use_tensor_dimensions(_output->info());
-
-    Window in_slice  = window.first_slice_window_3D();
-    Window out_slice = out_window.first_slice_window_2D();
-
-    // Set arguments
-    unsigned idx = 0;
-    add_3D_tensor_argument(idx, _input, in_slice);
-    add_2D_tensor_argument(idx, _output, out_slice);
-    if(_biases != nullptr)
-    {
-        Window biases_slice;
-        biases_slice.set(Window::DimX, Window::Dimension(0, _biases->info()->tensor_shape().x(), 1));
-        add_1D_tensor_argument(idx, _biases, biases_slice);
-    }
-
-    // Run kernel
-    enqueue(queue, *this, in_slice);
-}
-
-CLLocallyConnectedLayerWeightsReshapeKernel::CLLocallyConnectedLayerWeightsReshapeKernel()
-    : CLWeightsReshapeKernel(true)
-{
-}
-
-void CLLocallyConnectedLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
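The output shape computed in configure() above is worth unpacking: collapse(3) folds the first three dimensions into one, the first two dimensions are then swapped, and one extra row is reserved for the appended biases. The following standalone check restates that arithmetic with hypothetical sizes:

    #include <cassert>
    #include <cstddef>

    int main()
    {
        const std::size_t kx = 3, ky = 3, ifm = 48, ofm = 16;
        const bool        has_bias = true;

        // collapse(3): {kx, ky, ifm, ofm} -> {kx * ky * ifm, ofm}
        std::size_t dim0 = kx * ky * ifm; // 432
        std::size_t dim1 = ofm;           // 16

        // Swap, then add one row across all columns for the biases:
        // output is [OFM, kx * ky * IFM + 1] = [16, 433]
        const std::size_t tmp_dim = dim0;
        dim0 = dim1;
        dim1 = tmp_dim + (has_bias ? 1 : 0);

        assert(dim0 == 16 && dim1 == 433);
        return 0;
    }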
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index 4f52bf6279..e9b76e7967 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -95,7 +95,7 @@ NEWeightsReshapeKernel::NEWeightsReshapeKernel()
 void NEWeightsReshapeKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QS8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != input->info()->dimension(1));
 
@@ -108,28 +108,21 @@ void NEWeightsReshapeKernel::configure(const ITensor *input, const ITensor *bias
     output_shape.set(0, output_shape[1]);
     output_shape.set(1, tmp_dim + (bias != nullptr ? 1 : 0));
 
-    // Set data type and shape for output tensor if not yet configured
-    set_data_type_if_unknown(*output->info(), dt);
-    set_fixed_point_position_if_zero(*output->info(), fixed_point_position);
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, dt, fixed_point_position);
 
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, DataType::QS8);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
 
     if(bias != nullptr)
     {
-        TensorShape bias_shape{ input->info()->tensor_shape()[3] };
-
-        // Set data type and shape for bias tensor if not yet configured
-        set_data_type_if_unknown(*bias->info(), dt);
-        set_fixed_point_position_if_zero(*bias->info(), fixed_point_position);
-        set_shape_if_empty(*bias->info(), bias_shape);
-
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(bias->info()->tensor_shape(), bias_shape);
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::F16, DataType::F32, DataType::QS8);
         ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, bias);
+        ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (bias->info()->num_dimensions() != 1));
+        ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (bias->info()->num_dimensions() != 2));
+        ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (bias->info()->dimension(0) != input->info()->tensor_shape()[3]));
+        ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (bias->info()->dimension(0) != input->info()->tensor_shape()[3] || bias->info()->dimension(1) != input->info()->tensor_shape()[4]));
     }
 
     _input = input;
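Both back ends now funnel output initialization through auto_init_if_empty(), which fills in shape, data type and fixed-point position only when the caller left the tensor info empty, and otherwise leaves the existing metadata for the ERROR_ON_MISMATCHING_* checks to validate. A simplified stand-in illustrating that pattern (SimpleTensorInfo and the enum are sketches under that assumption, not the library's types):

    #include <cstddef>
    #include <vector>

    enum class DataType { UNKNOWN, QS8, F16, F32 };

    struct SimpleTensorInfo
    {
        std::vector<std::size_t> shape;                    // empty => not yet initialized
        int                      num_channels         = 0;
        DataType                 data_type            = DataType::UNKNOWN;
        int                      fixed_point_position = 0;
    };

    // If the tensor metadata is still empty, fill it in from the computed
    // values; if the caller already initialized it, leave it untouched so
    // the subsequent mismatch checks compare against the expected values.
    void auto_init_if_empty(SimpleTensorInfo &info, const std::vector<std::size_t> &shape,
                            int num_channels, DataType dt, int fixed_point_position)
    {
        if(info.shape.empty())
        {
            info.shape                = shape;
            info.num_channels         = num_channels;
            info.data_type            = dt;
            info.fixed_point_position = fixed_point_position;
        }
    }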
--
cgit v1.2.1