diff options
author | Gian Marco <gianmarco.iodice@arm.com> | 2017-11-21 10:57:50 +0000 |
---|---|---|
committer | Anthony Barbier <anthony.barbier@arm.com> | 2018-11-02 16:41:17 +0000 |
commit | 05288a2b871ef99f544771621c3bba409b2f70df (patch) | |
tree | 21e3d2a9927ef31f6d5bcdd5523c4c8e933047a6 /src/core/CL/cl_kernels/gemm.cl | |
parent | c82799003fbfdc5bb9526ff944e41eaae23e3f03 (diff) | |
download | ComputeLibrary-05288a2b871ef99f544771621c3bba409b2f70df.tar.gz |
COMPMID-697 - Rework GEMMLowp interface on OpenCL
Reworked the interface of GemmLowp in order to ease the integration
in Android NN
- Added support for different output stage
- Added validation for both matrix multiplication and output stage
- Added bounded relu support in the output stage
- Added int32_t bias support
- Added optimized path for vector by matrix case
This rework is required for:
- Convolution quantized
- Fully connected quantized
Change-Id: I512283d406099cf8c614dd89d0a97ed411143afc
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/110625
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels/gemm.cl')
-rw-r--r-- | src/core/CL/cl_kernels/gemm.cl | 104 |
1 file changed, 0 insertions, 104 deletions
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl index 15111ed352..c763cb355b 100644 --- a/src/core/CL/cl_kernels/gemm.cl +++ b/src/core/CL/cl_kernels/gemm.cl @@ -251,110 +251,6 @@ __kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src), } #if defined(COLS_B) -/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) - * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication - * - * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B - * - * @param[in] src0_ptr Pointer to the source matrix. Supported formats: U8 - * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src1_ptr Pointer to the source matrix. 
Supported formats: same as @p src0_ptr - * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported formats: same as @p src0_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] a_offset Offset to be added to each element of the matrix A - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] c_offset Offset to be added to each element of the matrix C. - * @param[in] c_mult_int Multiplied with each element of the matrix C. - * @param[in] shift Number of bits to shift right the result. 
- */ -__kernel void gemm_mm_interleaved_transposed_u8(IMAGE_DECLARATION(src0), - IMAGE_DECLARATION(src1), - IMAGE_DECLARATION(dst), - int a_offset, - int b_offset, - int c_offset, - int c_mult_int, - int shift) -{ - // src_addr.s0 = address of matrix A - // src_addr.s1 = address of matrix B - - // Compute address for matrix A and B - int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y), - (src1_stride_y)); - - // Add offset_first_element_in_bytes - src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); - - // Compute end row address for matrix B - int end_row_mtx_b = src_addr.s1 + COLS_B; - - // Reset accumulators - int16 c00 = 0.0f; - int16 c10 = 0.0f; - int16 c20 = 0.0f; - int16 c30 = 0.0f; - - for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 32)) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - int8 a0 = (int8)a_offset + convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0)); - int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1)); - - c00 += (int16)a0.s0 * b0; - c10 += (int16)a0.s1 * b0; - c20 += (int16)a0.s2 * b0; - c30 += (int16)a0.s3 * b0; - - int16 b1 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16)); - - c00 += (int16)a0.s4 * b1; - c10 += (int16)a0.s5 * b1; - c20 += (int16)a0.s6 * b1; - c30 += (int16)a0.s7 * b1; - } - - for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16)) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - int4 a0 = (int4)a_offset + convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0)); - int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1)); - - c00 += (int16)a0.s0 * b0; - c10 += (int16)a0.s1 * b0; - c20 += (int16)a0.s2 * b0; - c30 += (int16)a0.s3 * b0; - } - - // Compute destination address - Image dst = 
CONVERT_TO_IMAGE_STRUCT(dst); - - // Multiply by the weight of matrix product - c00 = (((int16)c_offset + c00) * (int16)c_mult_int) >> shift; - c10 = (((int16)c_offset + c10) * (int16)c_mult_int) >> shift; - c20 = (((int16)c_offset + c20) * (int16)c_mult_int) >> shift; - c30 = (((int16)c_offset + c30) * (int16)c_mult_int) >> shift; - - // Store 4x16 block - vstore16(convert_uchar16_sat(c00), 0, (__global uchar *)(offset(&dst, 0, 0))); - vstore16(convert_uchar16_sat(c10), 0, (__global uchar *)(offset(&dst, 0, 1))); - vstore16(convert_uchar16_sat(c20), 0, (__global uchar *)(offset(&dst, 0, 2))); - vstore16(convert_uchar16_sat(c30), 0, (__global uchar *)(offset(&dst, 0, 3))); -} - /** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1) * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication * |