author    Gian Marco Iodice <gianmarco.iodice@arm.com>    2018-10-18 10:21:02 +0100
committer Anthony Barbier <anthony.barbier@arm.com>    2018-11-02 16:55:45 +0000
commit    4b90865ab985d571f70c60583cdfb8c7a65f1670 (patch)
tree      f116a4ffef5f5e823689dd00c1e5c9d987f3d295 /arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
parent    c55beee7ef70fa08a5d217619083b288a74fcb27 (diff)
download  ComputeLibrary-4b90865ab985d571f70c60583cdfb8c7a65f1670.tar.gz
COMPMID-1413 - Improve the performance of GEMMLowp with 8 bit dot product on OpenCL
COMPMID-1424 - Add dot product support for CLDepthwise QASYMM8 3x3 NHWC non-unit stride

With this patch we are able to improve the performance of MobileNet v1-qasymm8 by 37%. I tried to use the dot product instruction in CLDepthwise QASYMM8 3x3 NHWC non-unit stride as well, but I have not seen any benefit (maybe because we have few arithmetic operations and no additional load instructions). However, depthwise convolution has been improved by 30%.

Change-Id: Id768a99c2e53a04276707e427af5d0ec93419ada
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/155082
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
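For context, here is a minimal C++ sketch (not part of this patch; the function name is illustrative) of the scalar arithmetic that one 8-bit dot product instruction replaces. Each group of four uint8 x uint8 multiplies, plus their accumulation into an int32, becomes a single instruction, which is where the GEMMLowp speed-up comes from.

#include <cstddef>
#include <cstdint>

// Reference inner loop of a QASYMM8 GEMM: accumulate products of 8-bit
// values into a 32-bit accumulator. A dot product instruction computes
// each 4-wide multiply-accumulate group below in one operation.
int32_t dot_product_u8(const uint8_t *a, const uint8_t *b, size_t k)
{
    int32_t acc = 0;
    for(size_t i = 0; i < k; i += 4) // assumes k is a multiple of 4
    {
        acc += static_cast<int32_t>(a[i + 0]) * b[i + 0]
             + static_cast<int32_t>(a[i + 1]) * b[i + 1]
             + static_cast<int32_t>(a[i + 2]) * b[i + 2]
             + static_cast<int32_t>(a[i + 3]) * b[i + 3];
    }
    return acc;
}

Note that the raw products above deliberately ignore the quantization offsets: since sum_i (a_i + a_off)(b_i + b_off) = sum_i a_i*b_i + a_off*sum_i b_i + b_off*sum_i a_i + K*a_off*b_off, the offset terms can be added after the matrix multiplication, which is exactly what the reduction and offset-contribution kernels listed in the diff below do.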
Diffstat (limited to 'arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h')
-rw-r--r-- arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h | 54
1 file changed, 31 insertions, 23 deletions
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
index f404ccdf4c..82f307a773 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
@@ -27,6 +27,7 @@
#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
#include "arm_compute/runtime/CL/CLMemoryGroup.h"
@@ -45,7 +46,8 @@ class ICLTensor;
* -# @ref CLGEMMLowpMatrixMultiplyKernel
* -# @ref CLGEMMLowpMatrixAReductionKernel (if the offset of matrix B is not 0)
* -# @ref CLGEMMLowpMatrixBReductionKernel (if the offset of matrix A is not 0)
- * -# @ref CLGEMMLowpOffsetContributionKernel
+ * -# @ref CLGEMMLowpOffsetContributionKernel (if gemm_info.gemmlowp_output_stage == NONE)
+ * -# @ref CLGEMMLowpOffsetContributionOutputStageKernel (if gemm_info.gemmlowp_output_stage != NONE)
*
*/
class CLGEMMLowpMatrixMultiplyCore : public IFunction
@@ -63,54 +65,60 @@ public:
CLGEMMLowpMatrixMultiplyCore &operator=(CLGEMMLowpMatrixMultiplyCore &&) = default;
/** Initialise the kernel's inputs, output
*
- * @note GEMM_LOWP: low precision GEMM kernel
+ * @note GEMMLowp: low precision GEMM kernel. [A * B + C]
* This kernel performs the following computations:
*
* -# Convert a values from QASYMM8 to int32 and add a_offset to each of them.
 * -# Convert b values from QASYMM8 to int32 and add b_offset to each of them.
* -# Compute the matrix product of the resulting a * b in int32.
+ * -# Quantize to uint8 if gemm_info.gemmlowp_output_stage != NONE
*
* @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8.
* @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
- * @param[out] output Output tensor. Data type supported: Data type supported: S32
+ * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: S32
+ * @param[out] output Output tensor. Data type supported: S32 or QASYMM8 if gemm_info.gemmlowp_output_stage != NONE
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
* if the reshape of matrix B should be executed only for the first run
*/
- void configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info = GEMMInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyCore
*
* @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8.
* @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
- * @param[in] output Output tensor. Data type supported: Data type supported: S32
+ * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: S32
+ * @param[in] output Output tensor. Data type supported: S32 or QASYMM8 if gemm_info.gemmlowp_output_stage != NONE
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
* if the reshape of matrix B should be executed only for the first run
*
* @return a status
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
+ static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
// Inherited methods overridden:
void run() override;
void prepare() override;
private:
- CLMemoryGroup _memory_group;
- CLGEMMLowpMatrixMultiplyKernel _mm_kernel;
- CLGEMMInterleave4x4Kernel _mtx_a_reshape_kernel;
- CLGEMMTranspose1xWKernel _mtx_b_reshape_kernel;
- CLGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel;
- CLGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel;
- CLGEMMLowpOffsetContributionKernel _offset_contribution_kernel;
- CLTensor _vector_sum_col;
- CLTensor _vector_sum_row;
- CLTensor _tmp_a;
- CLTensor _tmp_b;
- const ICLTensor *_original_b;
- int32_t _a_offset;
- int32_t _b_offset;
- bool _is_interleaved_transposed;
- bool _reshape_b_only_on_first_run;
- bool _is_prepared;
+ CLMemoryGroup _memory_group;
+ CLGEMMLowpMatrixMultiplyKernel _mm_kernel;
+ CLGEMMInterleave4x4Kernel _mtx_a_reshape_kernel;
+ CLGEMMTranspose1xWKernel _mtx_b_reshape_kernel;
+ CLGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel;
+ CLGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel;
+ CLGEMMLowpOffsetContributionKernel _offset_contribution_kernel;
+ CLGEMMLowpOffsetContributionOutputStageKernel _offset_contribution_output_stage_kernel;
+ CLTensor _vector_sum_col;
+ CLTensor _vector_sum_row;
+ CLTensor _tmp_a;
+ CLTensor _tmp_b;
+ CLTensor _mm_result_s32;
+ const ICLTensor *_original_b;
+ int32_t _a_offset;
+ int32_t _b_offset;
+ bool _is_interleaved_transposed;
+ bool _reshape_b_only_on_first_run;
+ bool _is_prepared;
+ bool _fuse_output_stage;
};
}
#endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H__ */
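A hypothetical usage sketch of the new configure() signature above, calling it with no bias (c == nullptr) and the default GEMMInfo so the output stays S32; tensor shapes and quantization parameters are made up for illustration.

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init(); // create the OpenCL context and queue

    // A is M x K, B is K x N, output is M x N; TensorShape is (width, height).
    CLTensor a, b, output;
    a.allocator()->init(TensorInfo(TensorShape(32U, 4U), 1, DataType::QASYMM8, QuantizationInfo(1.f / 255, 10)));
    b.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::QASYMM8, QuantizationInfo(1.f / 255, 5)));
    output.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::S32));

    // With c == nullptr and gemm_info.gemmlowp_output_stage == NONE the result
    // stays S32 and CLGEMMLowpOffsetContributionKernel runs; passing an output
    // stage in GEMMInfo would instead fuse the requantization to QASYMM8 via
    // CLGEMMLowpOffsetContributionOutputStageKernel.
    CLGEMMLowpMatrixMultiplyCore gemmlowp;
    gemmlowp.configure(&a, &b, nullptr, &output);

    a.allocator()->allocate();
    b.allocator()->allocate();
    output.allocator()->allocate();

    gemmlowp.run();
    CLScheduler::get().sync(); // wait for the OpenCL work to finish
    return 0;
}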