aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/cl_kernels/gemm.cl
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/CL/cl_kernels/gemm.cl')
-rw-r--r--src/core/CL/cl_kernels/gemm.cl104
1 files changed, 0 insertions, 104 deletions
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index 15111ed352..c763cb355b 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -251,110 +251,6 @@ __kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src),
}
#if defined(COLS_B)
-/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication
- *
- * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported formats: U8
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported formats: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported formats: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] a_offset Offset to be added to each element of the matrix A
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- * @param[in] c_offset Offset to be added to each element of the matrix C.
- * @param[in] c_mult_int Multiplied with each element of the matrix C.
- * @param[in] shift Number of bits to shift right the result.
- */
-__kernel void gemm_mm_interleaved_transposed_u8(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst),
- int a_offset,
- int b_offset,
- int c_offset,
- int c_mult_int,
- int shift)
-{
- // src_addr.s0 = address of matrix A
- // src_addr.s1 = address of matrix B
-
- // Compute address for matrix A and B
- int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
- (src1_stride_y));
-
- // Add offset_first_element_in_bytes
- src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
-
- // Compute end row address for matrix B
- int end_row_mtx_b = src_addr.s1 + COLS_B;
-
- // Reset accumulators
- int16 c00 = 0.0f;
- int16 c10 = 0.0f;
- int16 c20 = 0.0f;
- int16 c30 = 0.0f;
-
- for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 32))
- {
- // Load values from matrix A (interleaved) and matrix B (transposed)
- int8 a0 = (int8)a_offset + convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0));
- int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
-
- c00 += (int16)a0.s0 * b0;
- c10 += (int16)a0.s1 * b0;
- c20 += (int16)a0.s2 * b0;
- c30 += (int16)a0.s3 * b0;
-
- int16 b1 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16));
-
- c00 += (int16)a0.s4 * b1;
- c10 += (int16)a0.s5 * b1;
- c20 += (int16)a0.s6 * b1;
- c30 += (int16)a0.s7 * b1;
- }
-
- for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16))
- {
- // Load values from matrix A (interleaved) and matrix B (transposed)
- int4 a0 = (int4)a_offset + convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0));
- int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
-
- c00 += (int16)a0.s0 * b0;
- c10 += (int16)a0.s1 * b0;
- c20 += (int16)a0.s2 * b0;
- c30 += (int16)a0.s3 * b0;
- }
-
- // Compute destination address
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Multiply by the weight of matrix product
- c00 = (((int16)c_offset + c00) * (int16)c_mult_int) >> shift;
- c10 = (((int16)c_offset + c10) * (int16)c_mult_int) >> shift;
- c20 = (((int16)c_offset + c20) * (int16)c_mult_int) >> shift;
- c30 = (((int16)c_offset + c30) * (int16)c_mult_int) >> shift;
-
- // Store 4x16 block
- vstore16(convert_uchar16_sat(c00), 0, (__global uchar *)(offset(&dst, 0, 0)));
- vstore16(convert_uchar16_sat(c10), 0, (__global uchar *)(offset(&dst, 0, 1)));
- vstore16(convert_uchar16_sat(c20), 0, (__global uchar *)(offset(&dst, 0, 2)));
- vstore16(convert_uchar16_sat(c30), 0, (__global uchar *)(offset(&dst, 0, 3)));
-}
-
/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
*