From 5fc07aa26faba56b815156fc84d852e80219fe50 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Wed, 15 May 2019 17:08:02 +0100 Subject: COMPMID-2338: Remove CLGEMMInterleave4x4 and CLGEMMTranspose1xW Change-Id: I527fc97eac51308de601e5d1d50e75e4d89c5ee5 Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/1158 Tested-by: Arm Jenkins Reviewed-by: Giuseppe Rossini Comments-Addressed: Arm Jenkins --- src/core/CL/cl_kernels/gemm.cl | 181 ----------------------------------------- 1 file changed, 181 deletions(-) (limited to 'src/core/CL/cl_kernels/gemm.cl') diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl index da45d0fc18..41e5c338b3 100644 --- a/src/core/CL/cl_kernels/gemm.cl +++ b/src/core/CL/cl_kernels/gemm.cl @@ -2205,187 +2205,6 @@ __kernel void gemm_mm_native(IMAGE_DECLARATION(lhs), } #endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE) -#if defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH) - -#if ELEMENT_SIZE == 1 -#define DATA_TYPE uchar -#elif ELEMENT_SIZE == 2 -#define DATA_TYPE ushort -#elif ELEMENT_SIZE == 4 -#define DATA_TYPE uint -#else // ELEMENT_SIZE == 1 -#error "Element size not supported" -#endif // ELEMENT_SIZE - -/** This OpenCL kernel computes the "vector" 1xW transposition of input matrix - * - * @note The transposition width must be passed at compile time using -DTRANSPOSE_W (i.e. -DTRANSPOSE_W) - * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2) - * - * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - */ -__kernel void gemm_transpose1xW(TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - - // Compute address for Matrix B - source - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - - // Compute address for Matrix B transposed - destination. X and Y are swapped - uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + y * TRANSPOSE_W * sizeof(DATA_TYPE) * MULT_TRANSPOSE1XW_WIDTH + (x / MULT_TRANSPOSE1XW_WIDTH) * dst_stride_y + - (x % MULT_TRANSPOSE1XW_WIDTH) * TRANSPOSE_W * sizeof(DATA_TYPE); - - // Add offset for batched GEMM - dst_addr_in_bytes += z * dst_stride_z; - - VEC_DATA_TYPE(DATA_TYPE, TRANSPOSE_W) - b0 = VLOAD(TRANSPOSE_W)(0, (__global DATA_TYPE *)src.ptr); - - VSTORE(TRANSPOSE_W) - (b0, 0, (__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes)); -} -#endif // defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH) - -#if defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE) - -/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block. If -DUNROLL_BLOCK is passed at compile time, the 4x4 block - * will be simply unrolled. - * - * @note The data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=float) - * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2) - * @note In case the input has to be reinterpreted as a 3D tensor (i.e. input of convolution layer 1x1), the following information must be passed at compile time: - * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D - * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped - * - * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D) - */ -__kernel void gemm_interleave4x4(TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst) -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D - ) -{ - // Compute source and destination addresses - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - - // Compute address for source tensor - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - - // Compute address for Matrix B transposed - destination. X and Y are swapped - uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * 16 * MULT_INTERLEAVE4X4_HEIGHT + (y / MULT_INTERLEAVE4X4_HEIGHT) * dst_stride_y + - (y % MULT_INTERLEAVE4X4_HEIGHT) * 4 * sizeof(DATA_TYPE); - - // Add offset for batched GEMM - dst_addr_in_bytes += z * dst_stride_z; - -#if defined(REINTERPRET_INPUT_AS_3D) - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * 4 * sizeof(DATA_TYPE) + y * 4 * src_stride_y; - - // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zin) is calculated dividing M (y * 4) by HEIGHT_GEMM3D - uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(y * 4)) / (uint4)HEIGHT_GEMM3D; - zin = min(DEPTH_GEMM3D - 1, zin); - - // Add offset due to the cross plane paddings - zin *= (cross_plane_pad * src_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply src_stride_z by DEPTH_GEMM3D - input_ptr += z * src_stride_z * DEPTH_GEMM3D; - - // Load values from Matrix A - LOAD_BLOCK(4, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin.s); - -#else // defined(REINTERPRET_INPUT_AS_3D) - __global uchar *input_ptr = src.ptr; - - // Load values from Matrix A - VEC_DATA_TYPE(DATA_TYPE, 4) - a0 = vload4(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y)); - VEC_DATA_TYPE(DATA_TYPE, 4) - a1 = vload4(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y)); - VEC_DATA_TYPE(DATA_TYPE, 4) - a2 = vload4(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y)); - VEC_DATA_TYPE(DATA_TYPE, 4) - a3 = vload4(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y)); -#endif // defined(REINTERPRET_INPUT_AS_3D) - -#if defined(UNROLL_BLOCK) - vstore4(a0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 0 * MULT_INTERLEAVE4X4_HEIGHT)); - vstore4(a1, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 4 * MULT_INTERLEAVE4X4_HEIGHT)); - vstore4(a2, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 8 * MULT_INTERLEAVE4X4_HEIGHT)); - vstore4(a3, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 12 * MULT_INTERLEAVE4X4_HEIGHT)); -#else // defined(UNROLL_BLOCK) - VEC_DATA_TYPE(DATA_TYPE, 4) - val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s0, a1.s0, a2.s0, a3.s0); - vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 0 * MULT_INTERLEAVE4X4_HEIGHT)); - - val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s1, a1.s1, a2.s1, a3.s1); - vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 4 * MULT_INTERLEAVE4X4_HEIGHT)); - - val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s2, a1.s2, a2.s2, a3.s2); - vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 8 * MULT_INTERLEAVE4X4_HEIGHT)); - - val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s3, a1.s3, a2.s3, a3.s3); - vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 12 * MULT_INTERLEAVE4X4_HEIGHT)); -#endif // defined(UNROLL_BLOCK) -} -#endif // defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE) - #if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT) /** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1) * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication -- cgit v1.2.1