COMPMID-1276 - Allow GEMM to work with 3D input tensor

Skipped im2col in CLGEMMConvolutionLayer for 1x1 convolutions with NHWC data layout

Change-Id: I894e6b952ed8605e8f3ffc0ffc25c24730d4664c
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/141909
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
author: Gian Marco Iodice <gianmarco.iodice@arm.com> 2018-07-26 11:44:03 +0100
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:54:54 +0000
commit: 68a3f56627b04acdefebe67d645727dd83889766 (patch)
tree: 4a3f4dc0facfda861a5ba7afa29d84d82d0829c2 /arm_compute
parent: 4e0d3819be6c61cc00c7e0fa9b4b740738c703b7 (diff)
download: ComputeLibrary-68a3f56627b04acdefebe67d645727dd83889766.tar.gz
5 files changed, 75 insertions, 37 deletions
diff --git a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
index 7f8e766f1a..4592fc2921 100644
--- a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
@@ -67,17 +67,19 @@ public:
      * @param[in]  input                     Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
      * @param[out] output                    Output tensor. Data type supported: same as @p input
      * @param[in]  mult_interleave4x4_height (Optional) Multiplication factor for the height of the 4x4 interleave block
+     * @param[in]  reinterpret_input_as_3d   (Optional) True if the input has to be reinterpreted as 3D tensor
      */
-    void configure(const ICLTensor *input, ICLTensor *output, int mult_interleave4x4_height = 1);
+    void configure(const ICLTensor *input, ICLTensor *output, int mult_interleave4x4_height = 1, bool reinterpret_input_as_3d = false);
     /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMInterleave4x4Kernel
      *
      * @param[in] input                     Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
      * @param[in] output                    Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
      * @param[in] mult_interleave4x4_height Multiplication factor for the height of the 4x4 interleave block
+     * @param[in] reinterpret_input_as_3d   (Optional) True if the input has to be reinterpreted as 3D tensor
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, int mult_interleave4x4_height);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, int mult_interleave4x4_height, bool reinterpret_input_as_3d);
 
     // Inherited methods overridden
     void run(const Window &window, cl::CommandQueue &queue) override;
@@ -85,6 +87,7 @@ public:
 private:
     const ICLTensor *_input;
     ICLTensor       *_output;
+    bool             _reinterpret_input_as_3d;
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_CLGEMMINTERLEAVE4X4KERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
index 1b6a0c87a9..e030fa2d2a 100644
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
@@ -85,7 +85,8 @@ public:
     const ICLTensor *_input1;
     ICLTensor       *_output;
     bool             _slide_matrix_b;
-    bool             _is_gemm3d;
+    bool             _reinterpret_input_as_3d;
+    bool             _reinterpret_output_as_3d;
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H__ */
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 00370918bd..81d652dd7d 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -1031,7 +1031,7 @@ class GEMMReshapeInfo final
 public:
     /** Default constructor */
     GEMMReshapeInfo()
-        : _m(1), _n(1), _k(1), _mult_transpose1xW_width(1), _mult_interleave4x4_height(1), _depth_output_gemm3d(1)
+        : _m(1), _n(1), _k(1), _mult_transpose1xW_width(1), _mult_interleave4x4_height(1), _depth_output_gemm3d(1), _reinterpret_input_as_3d(false)
     {
     }
     /** Constructor
@@ -1042,9 +1042,12 @@ public:
      * @param[in] mult_transpose1xW_width   (Optional) Multiplication factor for the width of the 1xW transposed block
      * @param[in] mult_interleave4x4_height (Optional) Multiplication factor for the height of the 4x4 interleaved block
      * @param[in] depth_output_gemm3d       (Optional) Depth (third dimension) of the output tensor to be used with the GEMM3D kernel
+     * @param[in] reinterpret_input_as_3d   (Optional) Reinterpret the input as 3D tensor. (i.e. this flag should be set to true when GEMM is used
+     *                                                 to perform 1x1 convolutions with the NHWC data layout)
      */
-    GEMMReshapeInfo(int m, int n, int k, int mult_transpose1xW_width = 1, int mult_interleave4x4_height = 1, int depth_output_gemm3d = 1)
-        : _m(m), _n(n), _k(k), _mult_transpose1xW_width(mult_transpose1xW_width), _mult_interleave4x4_height(mult_interleave4x4_height), _depth_output_gemm3d(depth_output_gemm3d)
+    GEMMReshapeInfo(int m, int n, int k, int mult_transpose1xW_width = 1, int mult_interleave4x4_height = 1, int depth_output_gemm3d = 1, bool reinterpret_input_as_3d = false)
+        : _m(m), _n(n), _k(k), _mult_transpose1xW_width(mult_transpose1xW_width), _mult_interleave4x4_height(mult_interleave4x4_height), _depth_output_gemm3d(depth_output_gemm3d),
+          _reinterpret_input_as_3d(reinterpret_input_as_3d)
     {
     }
     /** Number of matrix A rows
@@ -1098,14 +1101,23 @@ public:
     {
         return _depth_output_gemm3d;
     }
+    /** Flag which specifies if the input tensor has to be reinterpreted as 3D
+     *
+     * @return True if the input tensor has to be reinterpreted as 3D tensor
+     */
+    bool reinterpret_input_as_3d() const
+    {
+        return _reinterpret_input_as_3d;
+    };
 
 private:
-    const int _m;
-    const int _n;
-    const int _k;
-    const int _mult_transpose1xW_width;
-    const int _mult_interleave4x4_height;
-    const int _depth_output_gemm3d;
+    const int  _m;
+    const int  _n;
+    const int  _k;
+    const int  _mult_transpose1xW_width;
+    const int  _mult_interleave4x4_height;
+    const int  _depth_output_gemm3d;
+    const bool _reinterpret_input_as_3d;
 };
 
 /** GEMM information class. This class stores the necessary information to compute GEMM functions
@@ -1118,7 +1130,7 @@ class GEMMInfo
 public:
     /** Default constructor */
     GEMMInfo()
-        : _is_a_reshaped(false), _is_b_reshaped(false), _reshape_b_only_on_first_run(false), _depth_output_gemm3d(1)
+        : _is_a_reshaped(false), _is_b_reshaped(false), _reshape_b_only_on_first_run(false), _depth_output_gemm3d(1), _reinterpret_input_as_3d(false)
     {
     }
     /** Constructor
@@ -1127,10 +1139,13 @@ public:
      * @param[in] is_b_reshaped               True if the matrix B has been reshaped
      * @param[in] reshape_b_only_on_first_run Reshape matrix B only for the first run
      * @param[in] depth_output_gemm3d         (Optional) Depth (third dimension) of the output tensor to be used with the GEMM3D kernel
+     * @param[in] reinterpret_input_as_3d     (Optional) Reinterpret the input as 3D tensor. (i.e. this flag should be set to true when GEMM is used
+     *                                        to perform 1x1 convolutions with the NHWC data layout)
      *
      */
-    GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run, int depth_output_gemm3d = 1)
-        : _is_a_reshaped(is_a_reshaped), _is_b_reshaped(is_b_reshaped), _reshape_b_only_on_first_run(reshape_b_only_on_first_run), _depth_output_gemm3d(depth_output_gemm3d)
+    GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run, int depth_output_gemm3d = 1, bool reinterpret_input_as_3d = false)
+        : _is_a_reshaped(is_a_reshaped), _is_b_reshaped(is_b_reshaped), _reshape_b_only_on_first_run(reshape_b_only_on_first_run), _depth_output_gemm3d(depth_output_gemm3d),
+          _reinterpret_input_as_3d(reinterpret_input_as_3d)
     {
     }
     /** Flag which specifies if the matrix A has been reshaped
@@ -1167,12 +1182,21 @@ public:
     {
         return _depth_output_gemm3d;
     };
+    /** Flag which specifies if the input tensor has to be reinterpreted as 3D
+     *
+     * @return True if the input tensor has to be reinterpreted as 3D tensor
+     */
+    bool reinterpret_input_as_3d() const
+    {
+        return _reinterpret_input_as_3d;
+    };
 
 private:
     const bool _is_a_reshaped;
     const bool _is_b_reshaped;
     const bool _reshape_b_only_on_first_run;
     const int  _depth_output_gemm3d;
+    const bool _reinterpret_input_as_3d;
 };
 
 /** Winograd information */
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index dbf26a423d..bf55add1d2 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -66,14 +66,24 @@ inline TensorShape compute_weights_reshaped_shape(const ITensorInfo &weights, bo
 
     return weights_reshaped;
 }
-inline TensorShape compute_interleaved_shape(const ITensorInfo &a, int mult_interleave4x4_height = 1)
+inline TensorShape compute_interleaved_shape(const ITensorInfo &a, int mult_interleave4x4_height = 1, bool reinterpret_input_as_3d = false)
 {
     // The interleaved output matrix will have the following shape: [ a_height * W, ceil(a_width / W) ] where W = 4 * mult_interleave4x4_height
     ARM_COMPUTE_ERROR_ON(mult_interleave4x4_height < 1);
     const int   interleave_width = 4 * mult_interleave4x4_height;
     TensorShape shape_interleaved_a{ a.tensor_shape() };
     shape_interleaved_a.set(0, a.dimension(0) * interleave_width);
-    shape_interleaved_a.set(1, std::ceil(a.dimension(1) / static_cast<float>(interleave_width)));
+    if(reinterpret_input_as_3d)
+    {
+        const int M      = a.dimension(1) * a.dimension(2);
+        const int height = std::ceil(M / static_cast<float>(interleave_width));
+        shape_interleaved_a.set(1, height);
+        shape_interleaved_a.remove_dimension(2);
+    }
+    else
+    {
+        shape_interleaved_a.set(1, std::ceil(a.dimension(1) / static_cast<float>(interleave_width)));
+    }
 
     return shape_interleaved_a;
 }
@@ -374,23 +384,26 @@ inline TensorShape compute_rnn_shape(const ITensorInfo *input, const unsigned in
 inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
 {
     ARM_COMPUTE_ERROR_ON_MSG(input0.num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");
+    ARM_COMPUTE_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The first input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true");
 
-    const bool is_gemm3d = reshape_info.depth_output_gemm3d() != 1;
+    const bool reinterpret_input_as_3d  = reshape_info.reinterpret_input_as_3d();
+    const bool reinterpret_output_as_3d = reshape_info.depth_output_gemm3d() != 1;
+    const int  m                        = reshape_info.reinterpret_input_as_3d() ? input0.dimension(1) * input0.dimension(2) : input0.dimension(1);
 
     // If the output of GEMM has to be reinterpreted as 3D, the number of input0 rows (M) is obtained collapsing the second and third
     // dimension of the output tensor
     const int dim0 = is_interleaved_transposed ? reshape_info.n() : input1.dimension(0);
-    const int dim1 = is_interleaved_transposed ? reshape_info.m() / reshape_info.depth_output_gemm3d() : input0.dimension(1) / reshape_info.depth_output_gemm3d();
-    const int dim2 = input0.tensor_shape()[2];
-    const int dim3 = input0.tensor_shape()[3];
+    const int dim1 = is_interleaved_transposed ? reshape_info.m() / reshape_info.depth_output_gemm3d() : m / reshape_info.depth_output_gemm3d();
+    const int dim2 = reinterpret_input_as_3d ? input0.tensor_shape()[3] : input0.tensor_shape()[2];
+    const int dim3 = reinterpret_input_as_3d ? 1 : input0.tensor_shape()[3];
 
     TensorShape output_shape{ input0.tensor_shape() };
 
     output_shape.set(0, dim0);
     output_shape.set(1, dim1);
-    output_shape.set(2, is_gemm3d ? reshape_info.depth_output_gemm3d() : dim2);
-    output_shape.set(3, is_gemm3d ? dim2 : dim3);
-    output_shape.set(4, is_gemm3d ? dim3 : 1);
+    output_shape.set(2, reinterpret_output_as_3d ? reshape_info.depth_output_gemm3d() : dim2);
+    output_shape.set(3, reinterpret_output_as_3d ? dim2 : dim3);
+    output_shape.set(4, reinterpret_output_as_3d ? dim3 : 1);
 
     return output_shape;
 }
diff --git a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
index 09daa5f568..7c272a348b 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
@@ -26,8 +26,8 @@
 
 #include "arm_compute/runtime/IFunction.h"
 
+#include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
 #include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
@@ -83,18 +83,12 @@ private:
 
 /** Basic function to compute the convolution layer. This function calls the following OpenCL kernels/functions:
  *
- * Note: weights already reshaped for quantized asymmetric is not supported
- *
  * -# @ref CLIm2ColKernel
- * -# @ref CLGEMMLowpMatrixMultiplyCore (if quantized asymmetric)
- * -# @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric)
- * -# @ref CLCol2ImKernel
- *
- * if the weights are already reshaped:
- * -# @ref CLGEMMInterleave4x4Kernel
- * -# @ref CLGEMMMatrixMultiplyKernel
- * else
- * -# @ref CLGEMM
+ * -# @ref CLGEMM (if the data type is FP32 or FP16)
+ * -# @ref CLGEMMLowpMatrixMultiplyCore (if the data type is QASYMM8)
+ * -# @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if the data type is QASYMM8)
+ * -# @ref CLArithmeticAdditionKernel (if biases != nullptr and we have a 1x1 convolution with the NHWC data layout)
+ * -# @ref CLCol2ImKernel (if NCHW data layout)
  */
 class CLGEMMConvolutionLayer : public IFunction
 {
@@ -172,10 +166,11 @@ private:
      * @param[in] output        Output tensor. Data types supported: Same as @p input,
      *                          except for input of QASYMM8 type where output should be of S32 type.
      * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1)
+     * @param[in] skip_im2col   (Optional) Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout. (Defaults to false)
      *
      * @return a status
      */
-    static Status validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, int gemm_3d_depth = 1);
+    static Status validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, int gemm_3d_depth = 1, bool skip_im2col = false);
 
 private:
     CLMemoryGroup                                       _memory_group;
@@ -186,6 +181,7 @@ private:
     CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage;
     CLCol2ImKernel                                      _col2im_kernel;
     CLActivationLayer                                   _activationlayer_function;
+    CLArithmeticAdditionKernel                          _add_bias_kernel;
 
     const ICLTensor *_original_weights;
 
@@ -196,6 +192,7 @@ private:
 
     DataLayout _data_layout;
 
+    bool _append_bias;
     bool _skip_im2col;
     bool _is_quantized;
     bool _is_activationlayer_enabled;
author	Gian Marco Iodice <gianmarco.iodice@arm.com>	2018-07-26 11:44:03 +0100
committer	Anthony Barbier <anthony.barbier@arm.com>	2018-11-02 16:54:54 +0000
commit	68a3f56627b04acdefebe67d645727dd83889766 (patch)
tree	4a3f4dc0facfda861a5ba7afa29d84d82d0829c2 /arm_compute
parent	4e0d3819be6c61cc00c7e0fa9b4b740738c703b7 (diff)
download	ComputeLibrary-68a3f56627b04acdefebe67d645727dd83889766.tar.gz