COMPMID-697 - Rework GEMMLowp interface on OpenCL

Reworked the interface of GemmLowp in order to ease its integration in Android NN - Added support for different output stages - Added validation for both matrix multiplication and output stage - Added bounded relu support in the output stage - Added int32_t bias support - Added optimized path for the vector-by-matrix case This rework is required for: - Convolution quantized - Fully connected quantized Change-Id: I512283d406099cf8c614dd89d0a97ed411143afc Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/110625 Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com>
author: Gian Marco <gianmarco.iodice@arm.com> 2017-11-21 10:57:50 +0000
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:41:17 +0000
commit: 05288a2b871ef99f544771621c3bba409b2f70df (patch)
tree: 21e3d2a9927ef31f6d5bcdd5523c4c8e933047a6 /arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
parent: c82799003fbfdc5bb9526ff944e41eaae23e3f03 (diff)
download: ComputeLibrary-05288a2b871ef99f544771621c3bba409b2f70df.tar.gz
1 files changed, 12 insertions, 19 deletions
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
index 05956aeeba..b60b80618c 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
@@ -30,15 +30,15 @@ namespace arm_compute
 {
 class ICLTensor;
 
-/** OpenCL kernel to compute low precision matrix multiplication kernel
+/** OpenCL kernel to multiply matrices
  *
+ * @note @ref CLGEMMLowpMatrixMultiplyKernel low precision matrix product kernel
  *  This kernel performs the following computation:
- *  -# Convert a values from uint8 to int32 and add a_offset to each of them.
- *  -# Convert b values from uint8 to int32 and add b_offset to each of them.
- *  -# Compute the int32 matrix product of the resulting a * b.
- *  -# Add output_offset to each entry of the result.
- *  -# Multiply each entry of the result and round to the nearest integer
- *  -# Clamp the resulting int32 values to the [0..255] range and cast to uint8.
+ *
+ *  -# Convert a values from int8 to int32
+ *  -# Convert b values from int8 to int32
+ *  -# Compute the int32 matrix product of the resulting a * b and store the result as int32
+ *
  */
 class CLGEMMLowpMatrixMultiplyKernel : public ICLKernel
 {
@@ -55,19 +55,12 @@ public:
     CLGEMMLowpMatrixMultiplyKernel &operator=(CLGEMMLowpMatrixMultiplyKernel &&) = default;
     /** Initialise the kernel's input and output.
      *
-     * The input matrices @p input0 and @p input1 must be the output of the kernels: @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel.
-     * These two kernels change the layout of the original matrices to be more cache-friendly.
-     *
-     * @param[in]  input0          Input tensor containing the interleaved Matrix A. Data types supported: U8
-     * @param[in]  input1          Input tensor containing the transposed Matrix B. Data types supported: same as @p input0
-     * @param[out] output          Output tensor to store the result of matrix multiplication, Data types supported: same as @p input0
-     * @param[in]  a_offset        Offset to be added to each element of the matrix A.
-     * @param[in]  b_offset        Offset to be added to each element of the matrix B.
-     * @param[in]  output_offset   Offset to be added to each element of the output matrix
-     * @param[in]  output_mult_int Offset to be added to each element of the output matrix
-     * @param[in]  shift           Number of bits to shift right the result.
+     * @param[in]  input0                    Input tensor containing the interleaved Matrix A. Data type supported: QASYMM8
+     * @param[in]  input1                    Input tensor containing the transposed1xW Matrix B. Data type supported: same as @p input0
+     * @param[out] output                    Output tensor to store the result of matrix multiplication. Data type supported: S32
+     * @param[in]  is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
      */
-    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift);
+    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed = true);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
author	Gian Marco <gianmarco.iodice@arm.com>	2017-11-21 10:57:50 +0000
committer	Anthony Barbier <anthony.barbier@arm.com>	2018-11-02 16:41:17 +0000
commit	05288a2b871ef99f544771621c3bba409b2f70df (patch)
tree	21e3d2a9927ef31f6d5bcdd5523c4c8e933047a6 /arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
parent	c82799003fbfdc5bb9526ff944e41eaae23e3f03 (diff)
download	ComputeLibrary-05288a2b871ef99f544771621c3bba409b2f70df.tar.gz