diff options
author | Gian Marco <gianmarco.iodice@arm.com> | 2017-11-21 10:57:50 +0000 |
---|---|---|
committer | Anthony Barbier <anthony.barbier@arm.com> | 2018-11-02 16:41:17 +0000 |
commit | 05288a2b871ef99f544771621c3bba409b2f70df (patch) | |
tree | 21e3d2a9927ef31f6d5bcdd5523c4c8e933047a6 /src/core/CL/cl_kernels/gemm.cl | |
parent | c82799003fbfdc5bb9526ff944e41eaae23e3f03 (diff) | |
download | ComputeLibrary-05288a2b871ef99f544771621c3bba409b2f70df.tar.gz |
COMPMID-697 - Rework GEMMLowp interface on OpenCL
Reworked the interface of GemmLowp in order to ease the integration
in Android NN
- Added support for different output stage
- Added validation for both matrix multiplication and output stage
- Added bounded relu support in the output stage
- Added int32_t bias support
- Added optimized path for vector by matrix case
This rework is required for:
- Convolution quantized
- Fully connected quantized
Change-Id: I512283d406099cf8c614dd89d0a97ed411143afc
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/110625
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels/gemm.cl')
-rw-r--r-- | src/core/CL/cl_kernels/gemm.cl | 104 |
1 file changed, 0 insertions, 104 deletions
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl index 15111ed352..c763cb355b 100644 --- a/src/core/CL/cl_kernels/gemm.cl +++ b/src/core/CL/cl_kernels/gemm.cl @@ -251,110 +251,6 @@ __kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src), } #if defined(COLS_B) -/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) - * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication - * - * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B - * - * @param[in] src0_ptr Pointer to the source matrix. Supported formats: U8 - * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src1_ptr Pointer to the source matrix. 
Supported formats: same as @p src0_ptr - * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported formats: same as @p src0_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] a_offset Offset to be added to each element of the matrix A - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] c_offset Offset to be added to each element of the matrix C. - * @param[in] c_mult_int Multiplied with each element of the matrix C. - * @param[in] shift Number of bits to shift right the result. 
- */ -__kernel void gemm_mm_interleaved_transposed_u8(IMAGE_DECLARATION(src0), - IMAGE_DECLARATION(src1), - IMAGE_DECLARATION(dst), - int a_offset, - int b_offset, - int c_offset, - int c_mult_int, - int shift) -{ - // src_addr.s0 = address of matrix A - // src_addr.s1 = address of matrix B - - // Compute address for matrix A and B - int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y), - (src1_stride_y)); - - // Add offset_first_element_in_bytes - src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); - - // Compute end row address for matrix B - int end_row_mtx_b = src_addr.s1 + COLS_B; - - // Reset accumulators - int16 c00 = 0.0f; - int16 c10 = 0.0f; - int16 c20 = 0.0f; - int16 c30 = 0.0f; - - for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 32)) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - int8 a0 = (int8)a_offset + convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0)); - int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1)); - - c00 += (int16)a0.s0 * b0; - c10 += (int16)a0.s1 * b0; - c20 += (int16)a0.s2 * b0; - c30 += (int16)a0.s3 * b0; - - int16 b1 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16)); - - c00 += (int16)a0.s4 * b1; - c10 += (int16)a0.s5 * b1; - c20 += (int16)a0.s6 * b1; - c30 += (int16)a0.s7 * b1; - } - - for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16)) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - int4 a0 = (int4)a_offset + convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0)); - int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1)); - - c00 += (int16)a0.s0 * b0; - c10 += (int16)a0.s1 * b0; - c20 += (int16)a0.s2 * b0; - c30 += (int16)a0.s3 * b0; - } - - // Compute destination address - Image dst = 
CONVERT_TO_IMAGE_STRUCT(dst); - - // Multiply by the weight of matrix product - c00 = (((int16)c_offset + c00) * (int16)c_mult_int) >> shift; - c10 = (((int16)c_offset + c10) * (int16)c_mult_int) >> shift; - c20 = (((int16)c_offset + c20) * (int16)c_mult_int) >> shift; - c30 = (((int16)c_offset + c30) * (int16)c_mult_int) >> shift; - - // Store 4x16 block - vstore16(convert_uchar16_sat(c00), 0, (__global uchar *)(offset(&dst, 0, 0))); - vstore16(convert_uchar16_sat(c10), 0, (__global uchar *)(offset(&dst, 0, 1))); - vstore16(convert_uchar16_sat(c20), 0, (__global uchar *)(offset(&dst, 0, 2))); - vstore16(convert_uchar16_sat(c30), 0, (__global uchar *)(offset(&dst, 0, 3))); -} - /** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1) * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication * |