aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/cl_kernels/gemm.cl
diff options
context:
space:
mode:
authorGian Marco Iodice <gianmarco.iodice@arm.com>2019-05-15 17:08:02 +0100
committerGian Marco Iodice <gianmarco.iodice@arm.com>2019-05-20 16:53:52 +0000
commit5fc07aa26faba56b815156fc84d852e80219fe50 (patch)
tree84d32d9157e2adc6371acd6b38bd66da4b523818 /src/core/CL/cl_kernels/gemm.cl
parentb9626ab169a168a7c1ca57edd1996e1e80938bf1 (diff)
downloadComputeLibrary-5fc07aa26faba56b815156fc84d852e80219fe50.tar.gz
COMPMID-2338: Remove CLGEMMInterleave4x4 and CLGEMMTranspose1xW
Change-Id: I527fc97eac51308de601e5d1d50e75e4d89c5ee5 Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Reviewed-on: https://review.mlplatform.org/c/1158 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Giuseppe Rossini <giuseppe.rossini@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels/gemm.cl')
-rw-r--r--src/core/CL/cl_kernels/gemm.cl181
1 files changed, 0 insertions, 181 deletions
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index da45d0fc18..41e5c338b3 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -2205,187 +2205,6 @@ __kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
}
#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
-#if defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH)
-
-#if ELEMENT_SIZE == 1
-#define DATA_TYPE uchar
-#elif ELEMENT_SIZE == 2
-#define DATA_TYPE ushort
-#elif ELEMENT_SIZE == 4
-#define DATA_TYPE uint
-#else // ELEMENT_SIZE == 1
-#error "Element size not supported"
-#endif // ELEMENT_SIZE
-
-/** This OpenCL kernel computes the "vector" 1xW transposition of input matrix
- *
- * @note The transposition width must be passed at compile time using -DTRANSPOSE_W (i.e. -DTRANSPOSE_W)
- * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_transpose1xW(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
- // Compute address for Matrix B - source
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-
- // Compute address for Matrix B transposed - destination. X and Y are swapped
- uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + y * TRANSPOSE_W * sizeof(DATA_TYPE) * MULT_TRANSPOSE1XW_WIDTH + (x / MULT_TRANSPOSE1XW_WIDTH) * dst_stride_y +
- (x % MULT_TRANSPOSE1XW_WIDTH) * TRANSPOSE_W * sizeof(DATA_TYPE);
-
- // Add offset for batched GEMM
- dst_addr_in_bytes += z * dst_stride_z;
-
- VEC_DATA_TYPE(DATA_TYPE, TRANSPOSE_W)
- b0 = VLOAD(TRANSPOSE_W)(0, (__global DATA_TYPE *)src.ptr);
-
- VSTORE(TRANSPOSE_W)
- (b0, 0, (__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes));
-}
-#endif // defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH)
-
-#if defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)
-
-/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block. If -DUNROLL_BLOCK is passed at compile time, the 4x4 block
- * will be simply unrolled.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=float)
- * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
- * @note In case the input has to be reinterpreted as a 3D tensor (i.e. input of convolution layer 1x1), the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
- */
-__kernel void gemm_interleave4x4(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst)
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
- )
-{
- // Compute source and destination addresses
- uint x = get_global_id(0);
- uint y = get_global_id(1);
- uint z = get_global_id(2);
-
- // Compute address for source tensor
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-
- // Compute address for Matrix B transposed - destination. X and Y are swapped
- uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * 16 * MULT_INTERLEAVE4X4_HEIGHT + (y / MULT_INTERLEAVE4X4_HEIGHT) * dst_stride_y +
- (y % MULT_INTERLEAVE4X4_HEIGHT) * 4 * sizeof(DATA_TYPE);
-
- // Add offset for batched GEMM
- dst_addr_in_bytes += z * dst_stride_z;
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * 4 * sizeof(DATA_TYPE) + y * 4 * src_stride_y;
-
- // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zin) is calculated dividing M (y * 4) by HEIGHT_GEMM3D
- uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(y * 4)) / (uint4)HEIGHT_GEMM3D;
- zin = min(DEPTH_GEMM3D - 1, zin);
-
- // Add offset due to the cross plane paddings
- zin *= (cross_plane_pad * src_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply src_stride_z by DEPTH_GEMM3D
- input_ptr += z * src_stride_z * DEPTH_GEMM3D;
-
- // Load values from Matrix A
- LOAD_BLOCK(4, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin.s);
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
- __global uchar *input_ptr = src.ptr;
-
- // Load values from Matrix A
- VEC_DATA_TYPE(DATA_TYPE, 4)
- a0 = vload4(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- a1 = vload4(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- a2 = vload4(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 4)
- a3 = vload4(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
-#if defined(UNROLL_BLOCK)
- vstore4(a0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 0 * MULT_INTERLEAVE4X4_HEIGHT));
- vstore4(a1, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 4 * MULT_INTERLEAVE4X4_HEIGHT));
- vstore4(a2, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 8 * MULT_INTERLEAVE4X4_HEIGHT));
- vstore4(a3, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 12 * MULT_INTERLEAVE4X4_HEIGHT));
-#else // defined(UNROLL_BLOCK)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s0, a1.s0, a2.s0, a3.s0);
- vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 0 * MULT_INTERLEAVE4X4_HEIGHT));
-
- val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s1, a1.s1, a2.s1, a3.s1);
- vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 4 * MULT_INTERLEAVE4X4_HEIGHT));
-
- val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s2, a1.s2, a2.s2, a3.s2);
- vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 8 * MULT_INTERLEAVE4X4_HEIGHT));
-
- val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s3, a1.s3, a2.s3, a3.s3);
- vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 12 * MULT_INTERLEAVE4X4_HEIGHT));
-#endif // defined(UNROLL_BLOCK)
-}
-#endif // defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)
-
#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication