diff options
Diffstat (limited to 'src/core')
-rw-r--r-- | src/core/CL/CLKernelLibrary.cpp | 8 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/gemm.cl | 420 | ||||
-rw-r--r-- | src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp | 59 | ||||
-rw-r--r-- | src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp | 75 | ||||
-rw-r--r-- | src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp | 37 |
5 files changed, 289 insertions, 310 deletions
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index 6695881d09..ae3553860a 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -211,9 +211,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map = { "gaussian1x5_sub_x", "gaussian_pyramid.cl" }, { "gaussian5x1_sub_y", "gaussian_pyramid.cl" }, { "gemm_accumulate_biases", "gemm.cl" }, - { "gemm_interleave4x4_8bit", "gemm.cl" }, - { "gemm_interleave4x4_16bit", "gemm.cl" }, - { "gemm_interleave4x4_32bit", "gemm.cl" }, + { "gemm_interleave4x4", "gemm.cl" }, { "gemm_ma_f16", "gemm.cl" }, { "gemm_ma_f32", "gemm.cl" }, { "gemm_ma_qs8", "gemm.cl" }, @@ -230,9 +228,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map = { "gemm_mm_qs8", "gemm.cl" }, { "gemm_mm_qs16", "gemm.cl" }, { "gemm_lc_vm_f32", "gemm.cl" }, - { "gemm_transpose1x16", "gemm.cl" }, - { "gemm_transpose1x8", "gemm.cl" }, - { "gemm_transpose1x4", "gemm.cl" }, + { "gemm_transpose1xW", "gemm.cl" }, { "gemmlowp_matrix_a_reduction", "gemmlowp.cl" }, { "gemmlowp_matrix_b_reduction", "gemmlowp.cl" }, { "gemmlowp_mm_bifrost", "gemmlowp.cl" }, diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl index c763cb355b..bad09f3c42 100644 --- a/src/core/CL/cl_kernels/gemm.cl +++ b/src/core/CL/cl_kernels/gemm.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -27,41 +27,23 @@ #include "fixed_point.h" #endif // FIXED_POINT_POSITION -/** This OpenCL kernel computes the "vector" 1x4 transposition of input matrix - * - * @param[in] src_ptr Pointer to the source matrix. Supported data types: U32/S32/F32 - * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - */ -__kernel void gemm_transpose1x4(IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - uint x = get_global_id(0); - uint y = get_global_id(1); +#if defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH) - // Compute address for Matrix B - source - Image src = CONVERT_TO_IMAGE_STRUCT(src); - - // Compute address for Matrix B transposed - destination. X and Y are swapped - uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes)); +#if TRANSPOSE_W == 4 +#define DATA_TYPE uint +#elif TRANSPOSE_W == 8 +#define DATA_TYPE ushort +#elif TRANSPOSE_W == 16 +#define DATA_TYPE uchar +#else // TRANSPOSE_W == 16 +#error "Transpose width not supported" +#endif // TRANSPOSE_W - uint4 b0 = vload4(0, (__global uint *)src.ptr); - - vstore4(b0, 0, (__global uint *)(dst_ptr + dst_addr_in_bytes)); -} - -/** This OpenCL kernel computes the "vector" 1x8 transposition of input matrix +/** This OpenCL kernel computes the "vector" 1xW transposition of input matrix + * + * @attention The multiplication factor (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2) * - * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/QS16/F16 + * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes) * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes) @@ -69,12 +51,12 @@ __kernel void gemm_transpose1x4(IMAGE_DECLARATION(src), * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix */ -__kernel void gemm_transpose1x8(IMAGE_DECLARATION(src), +__kernel void gemm_transpose1xW(IMAGE_DECLARATION(src), IMAGE_DECLARATION(dst)) { uint x = get_global_id(0); @@ -84,16 +66,22 @@ __kernel void gemm_transpose1x8(IMAGE_DECLARATION(src), Image src = CONVERT_TO_IMAGE_STRUCT(src); // Compute address for Matrix B transposed - destination. X and Y are swapped - uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes)); + uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + y * TRANSPOSE_W * sizeof(DATA_TYPE) * MULT_TRANSPOSE1XW_WIDTH + (x / MULT_TRANSPOSE1XW_WIDTH) * dst_stride_y + + (x % MULT_TRANSPOSE1XW_WIDTH) * TRANSPOSE_W * sizeof(DATA_TYPE); - ushort8 b0 = vload8(0, (__global ushort *)src.ptr); + VEC_DATA_TYPE(DATA_TYPE, TRANSPOSE_W) + b0 = VLOAD(TRANSPOSE_W)(0, (__global DATA_TYPE *)src.ptr); - vstore8(b0, 0, (__global ushort *)(dst_ptr + dst_addr_in_bytes)); + VSTORE(TRANSPOSE_W) + (b0, 0, (__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes)); } +#endif // defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH) -/** This OpenCL kernel computes the "vector" 1x16 transposition of input matrix +#if defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE) + +/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values * - * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8 + * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes) * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes) @@ -106,9 +94,10 @@ __kernel void gemm_transpose1x8(IMAGE_DECLARATION(src), * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix */ -__kernel void gemm_transpose1x16(IMAGE_DECLARATION(src), +__kernel void gemm_interleave4x4(IMAGE_DECLARATION(src), IMAGE_DECLARATION(dst)) { + // Compute source and destination addresses uint x = get_global_id(0); uint y = get_global_id(1); @@ -116,141 +105,35 @@ __kernel void gemm_transpose1x16(IMAGE_DECLARATION(src), Image src = CONVERT_TO_IMAGE_STRUCT(src); // Compute address for Matrix B transposed - destination. X and Y are swapped - uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes)); - - uchar16 b0 = vload16(0, (__global uchar *)src.ptr); - - vstore16(b0, 0, (__global uchar *)(dst_ptr + dst_addr_in_bytes)); -} - -/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values - * - * @param[in] src_ptr Pointer to the source matrix. Supported data types: U32/S32/F32 - * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - */ -__kernel void gemm_interleave4x4_32bit(IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - // Compute source and destination addresses - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Load values from Matrix A - uint4 a0 = vload4(0, (__global uint *)(offset(&src, 0, 0))); - uint4 a1 = vload4(0, (__global uint *)(offset(&src, 0, 1))); - uint4 a2 = vload4(0, (__global uint *)(offset(&src, 0, 2))); - uint4 a3 = vload4(0, (__global uint *)(offset(&src, 0, 3))); - - uint4 val0 = (uint4)(a0.s0, a1.s0, a2.s0, a3.s0); - vstore4(val0, 0, ((__global uint *)dst.ptr) + 0); - - val0 = (uint4)(a0.s1, a1.s1, a2.s1, a3.s1); - vstore4(val0, 0, ((__global uint *)dst.ptr) + 4); - - val0 = (uint4)(a0.s2, a1.s2, a2.s2, a3.s2); - vstore4(val0, 0, ((__global uint *)dst.ptr) + 8); - - val0 = (uint4)(a0.s3, a1.s3, a2.s3, a3.s3); - vstore4(val0, 0, ((__global uint *)dst.ptr) + 12); -} - -/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values - * - * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/QS16/F16 - * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - */ -__kernel void gemm_interleave4x4_16bit(IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - // Compute source and destination addresses - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Load values from Matrix A - ushort8 a0 = vload8(0, (__global ushort *)(offset(&src, 0, 0))); - ushort8 a1 = vload8(0, (__global ushort *)(offset(&src, 0, 1))); - ushort8 a2 = vload8(0, (__global ushort *)(offset(&src, 0, 2))); - ushort8 a3 = vload8(0, (__global ushort *)(offset(&src, 0, 3))); - - ushort8 val0 = (ushort8)((ushort4)(a0.s0, a1.s0, a2.s0, a3.s0), (ushort4)(a0.s1, a1.s1, a2.s1, a3.s1)); - vstore8(val0, 0, ((__global ushort *)dst.ptr) + 0); - - val0 = (ushort8)((ushort4)(a0.s2, a1.s2, a2.s2, a3.s2), (ushort4)(a0.s3, a1.s3, a2.s3, a3.s3)); - vstore8(val0, 0, ((__global ushort *)dst.ptr) + 8); - - val0 = (ushort8)((ushort4)(a0.s4, a1.s4, a2.s4, a3.s4), (ushort4)(a0.s5, a1.s5, a2.s5, a3.s5)); - vstore8(val0, 0, ((__global ushort *)dst.ptr) + 16); - - val0 = (ushort8)((ushort4)(a0.s6, a1.s6, a2.s6, a3.s6), (ushort4)(a0.s7, a1.s7, a2.s7, a3.s7)); - vstore8(val0, 0, ((__global ushort *)dst.ptr) + 24); -} - -/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values - * - * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8 - * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - */ -__kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - // Compute source and destination addresses - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * 16 * MULT_INTERLEAVE4X4_HEIGHT + (y / MULT_INTERLEAVE4X4_HEIGHT) * dst_stride_y + + (y % MULT_INTERLEAVE4X4_HEIGHT) * 4 * sizeof(DATA_TYPE); // Load values from Matrix A - uchar16 a0 = vload16(0, (__global uchar *)(offset(&src, 0, 0))); - uchar16 a1 = vload16(0, (__global uchar *)(offset(&src, 0, 1))); - uchar16 a2 = vload16(0, (__global uchar *)(offset(&src, 0, 2))); - uchar16 a3 = vload16(0, (__global uchar *)(offset(&src, 0, 3))); - - uchar16 val0 = (uchar16)((uchar4)(a0.s0, a1.s0, a2.s0, a3.s0), (uchar4)(a0.s1, a1.s1, a2.s1, a3.s1), - (uchar4)(a0.s2, a1.s2, a2.s2, a3.s2), (uchar4)(a0.s3, a1.s3, a2.s3, a3.s3)); - vstore16(val0, 0, ((__global uchar *)dst.ptr) + 0); - - val0 = (uchar16)((uchar4)(a0.s4, a1.s4, a2.s4, a3.s4), (uchar4)(a0.s5, a1.s5, a2.s5, a3.s5), - (uchar4)(a0.s6, a1.s6, a2.s6, a3.s6), (uchar4)(a0.s7, a1.s7, a2.s7, a3.s7)); - vstore16(val0, 0, ((__global uchar *)dst.ptr) + 16); - - val0 = (uchar16)((uchar4)(a0.s8, a1.s8, a2.s8, a3.s8), (uchar4)(a0.s9, a1.s9, a2.s9, a3.s9), - (uchar4)(a0.sA, a1.sA, a2.sA, a3.sA), (uchar4)(a0.sB, a1.sB, a2.sB, a3.sB)); - vstore16(val0, 0, ((__global uchar *)dst.ptr) + 32); - - val0 = (uchar16)((uchar4)(a0.sC, a1.sC, a2.sC, a3.sC), (uchar4)(a0.sD, a1.sD, a2.sD, a3.sD), - (uchar4)(a0.sE, a1.sE, a2.sE, a3.sE), (uchar4)(a0.sF, a1.sF, a2.sF, a3.sF)); - vstore16(val0, 0, ((__global uchar *)dst.ptr) + 48); + VEC_DATA_TYPE(DATA_TYPE, 4) + a0 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 0))); + VEC_DATA_TYPE(DATA_TYPE, 4) + a1 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 1))); + VEC_DATA_TYPE(DATA_TYPE, 4) + a2 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 2))); + VEC_DATA_TYPE(DATA_TYPE, 4) + a3 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 3))); + + VEC_DATA_TYPE(DATA_TYPE, 4) + val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s0, a1.s0, a2.s0, a3.s0); + vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 0 * MULT_INTERLEAVE4X4_HEIGHT)); + + val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s1, a1.s1, a2.s1, a3.s1); + vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 4 * MULT_INTERLEAVE4X4_HEIGHT)); + + val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s2, a1.s2, a2.s2, a3.s2); + vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 8 * MULT_INTERLEAVE4X4_HEIGHT)); + + val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s3, a1.s3, a2.s3, a3.s3); + vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 12 * MULT_INTERLEAVE4X4_HEIGHT)); } +#endif // defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE) -#if defined(COLS_B) +#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT) /** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1) * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication * @@ -270,30 +153,32 @@ __kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src), * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix */ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), IMAGE_DECLARATION(dst)) { - // src_addr.s0 = address of matrix A - // src_addr.s1 = address of matrix B - - // Compute address for matrix A and B - int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y), - (src1_stride_y)); + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; - // Add offset_first_element_in_bytes - src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4; - // Divide by 4 in order to get the src_addr in unit of float - src_addr = src_addr >> 2; + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + __global float *src_addr_a = (__global float *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes); + __global float *src_addr_b = (__global float *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes); // Compute end row address for matrix B - int end_row_mtx_b = src_addr.s1 + COLS_B; + __global float *src_end_addr_b = src_addr_b + COLS_B; + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; // Reset accumulators float4 c00 = 0.0f; @@ -301,11 +186,11 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0) float4 c20 = 0.0f; float4 c30 = 0.0f; - for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 8)) + for(; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH) { // Load values from matrix A (interleaved) and matrix B (transposed) - float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0); - float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1); + float4 a0 = vload4(0, src_addr_a); + float4 b0 = vload4(0, src_addr_b); c00 += (float4)a0.s0 * b0; c10 += (float4)a0.s1 * b0; @@ -313,8 +198,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0) c30 += (float4)a0.s3 * b0; // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0 + 4); - b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1 + 4); + a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT); + b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH); c00 += (float4)a0.s0 * b0; c10 += (float4)a0.s1 * b0; @@ -322,11 +207,11 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0) c30 += (float4)a0.s3 * b0; } - for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 4)) + for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH) { // Load values from matrix A (interleaved) and matrix B (transposed) - float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0); - float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1); + float4 a0 = vload4(0, src_addr_a); + float4 b0 = vload4(0, src_addr_b); c00 += (float4)a0.s0 * b0; c10 += (float4)a0.s1 * b0; @@ -371,23 +256,33 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0) * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix */ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), IMAGE_DECLARATION(dst)) { + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; + + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4; + // src_addr_a = address of matrix A // src_addr_b = address of matrix B - __global float *src_addr_a = (__global float *)(src0_ptr + get_global_id(1) * src0_stride_y + src0_offset_first_element_in_bytes); - __global float *src_addr_b = (__global float *)(src1_ptr + get_global_id(0) * src1_stride_y + src1_offset_first_element_in_bytes); + __global float *src_addr_a = (__global float *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes); + __global float *src_addr_b = (__global float *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes); // Compute end row address for matrix B __global float *src_end_addr_b = src_addr_b + COLS_B; + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; + // Reset accumulators float c00 = 0.0f; float c01 = 0.0f; @@ -406,7 +301,7 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0) float c32 = 0.0f; float c33 = 0.0f; - for(; src_addr_b <= (src_end_addr_b - 16); src_addr_a += 16, src_addr_b += 16) + for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += (16 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (16 * MULT_TRANSPOSE1XW_WIDTH)) { // Load values from matrix A (interleaved) and matrix B (transposed) float4 a0 = vload4(0, src_addr_a); @@ -433,8 +328,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0) c33 = fma(a0.s3, b0.s3, c33); // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, src_addr_a + 4); - b0 = vload4(0, src_addr_b + 4); + a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT); + b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH); c00 = fma(a0.s0, b0.s0, c00); c01 = fma(a0.s0, b0.s1, c01); @@ -457,8 +352,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0) c33 = fma(a0.s3, b0.s3, c33); // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, src_addr_a + 8); - b0 = vload4(0, src_addr_b + 8); + a0 = vload4(0, src_addr_a + 8 * MULT_INTERLEAVE4X4_HEIGHT); + b0 = vload4(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH); c00 = fma(a0.s0, b0.s0, c00); c01 = fma(a0.s0, b0.s1, c01); @@ -481,8 +376,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0) c33 = fma(a0.s3, b0.s3, c33); // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, src_addr_a + 12); - b0 = vload4(0, src_addr_b + 12); + a0 = vload4(0, src_addr_a + 12 * MULT_INTERLEAVE4X4_HEIGHT); + b0 = vload4(0, src_addr_b + 12 * MULT_TRANSPOSE1XW_WIDTH); c00 = fma(a0.s0, b0.s0, c00); c01 = fma(a0.s0, b0.s1, c01); @@ -505,7 +400,7 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0) c33 = fma(a0.s3, b0.s3, c33); } - for(; src_addr_b < src_end_addr_b; src_addr_a += 4, src_addr_b += 4) + for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * MULT_TRANSPOSE1XW_WIDTH)) { // Load values from matrix A (interleaved) and matrix B (transposed) float4 a0 = vload4(0, src_addr_a); @@ -555,8 +450,6 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0) c33 = c33 * ALPHA; #endif // defined(ALPHA) - barrier(CLK_GLOBAL_MEM_FENCE); - // Store 4x4 block vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(offset(&dst, 0, 0))); vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(offset(&dst, 0, 1))); @@ -584,30 +477,32 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0) * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix */ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), IMAGE_DECLARATION(dst)) { - // src_addr.s0 = address of matrix A - // src_addr.s1 = address of matrix B - - // Compute address for matrix A and B - int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y), - (src1_stride_y)); + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; - // Add offset_first_element_in_bytes - src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8; - // Divide by 2 in order to get the src_addr in unit of half - src_addr = src_addr >> 1; + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + __global half *src_addr_a = (__global half *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes); + __global half *src_addr_b = (__global half *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes); // Compute end row address for matrix B - int end_row_mtx_b = src_addr.s1 + COLS_B; + __global half *src_end_addr_b = src_addr_b + COLS_B; + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; // Reset accumulators half8 c00 = 0.0f; @@ -615,11 +510,11 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0), half8 c20 = 0.0f; half8 c30 = 0.0f; - for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(8, 16)) + for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH) { // Load values from matrix A (interleaved) and matrix B (transposed) - half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0); - half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1); + half4 a0 = vload4(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); c00 += (half8)a0.s0 * b0; c10 += (half8)a0.s1 * b0; @@ -627,8 +522,8 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0), c30 += (half8)a0.s3 * b0; // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0 + 4); - b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1 + 8); + a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT); + b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH); c00 += (half8)a0.s0 * b0; c10 += (half8)a0.s1 * b0; @@ -636,11 +531,11 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0), c30 += (half8)a0.s3 * b0; } - for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 8)) + for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH) { // Load values from matrix A (interleaved) and matrix B (transposed) - half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0); - half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1); + half4 a0 = vload4(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); c00 += (half8)a0.s0 * b0; c10 += (half8)a0.s1 * b0; @@ -689,27 +584,32 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0), * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix */ __kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), IMAGE_DECLARATION(dst)) { - // src_addr.s0 = address of matrix A - // src_addr.s1 = address of matrix B + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; - // Compute address for matrix A and B - int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y), - (src1_stride_y)); + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 16; - // Add offset_first_element_in_bytes - src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + __global char *src_addr_a = src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes; + __global char *src_addr_b = src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes; // Compute end row address for matrix B - int end_row_mtx_b = src_addr.s1 + COLS_B; + __global char *src_end_addr_b = src_addr_b + COLS_B; + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; // Reset accumulators short8 c00 = 0.0f; @@ -722,11 +622,11 @@ __kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0), short8 c31 = 0.0f; // This for loop performs 1 accumulation for each iteration - for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(4, 16)) + for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH) { // Load values from matrix A (interleaved) and matrix B (transposed) - char4 a0 = vload4(0, ((__global char *)src0_ptr) + src_addr.s0); - char16 b0 = vload16(0, ((__global char *)src1_ptr) + src_addr.s1); + char4 a0 = vload4(0, src_addr_a); + char16 b0 = vload16(0, src_addr_b); c00 = mlal_sat_qs8x8(c00, (char8)a0.s0, b0.s01234567, FIXED_POINT_POSITION); c10 = mlal_sat_qs8x8(c10, (char8)a0.s1, b0.s01234567, FIXED_POINT_POSITION); @@ -783,30 +683,32 @@ __kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0), * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix */ __kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), IMAGE_DECLARATION(dst)) { - // src_addr.s0 = address of matrix A - // src_addr.s1 = address of matrix B + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; - // Compute address for matrix A and B - int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y), - (src1_stride_y)); + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8; - // Add offset_first_element_in_bytes - src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); - - // Divide by 2 in order to get the src_addr in unit of short - src_addr = src_addr >> 1; + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + __global short *src_addr_a = (__global short *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes); + __global short *src_addr_b = (__global short *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes); // Compute end row address for matrix B - int end_row_mtx_b = src_addr.s1 + COLS_B; + __global short *src_end_addr_b = src_addr_b + COLS_B; + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; // Reset accumulators int8 c00 = 0.0f; @@ -815,11 +717,11 @@ __kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0), int8 c30 = 0.0f; // This for loop performs 1 accumulation for each iteration - for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(4, 8)) + for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH) { /* Load values from matrix A (interleaved) and matrix B (transposed) */ - short4 a0 = vload4(0, ((__global short *)src0_ptr) + src_addr.s0); - short8 b0 = vload8(0, ((__global short *)src1_ptr) + src_addr.s1); + short4 a0 = vload4(0, src_addr_a); + short8 b0 = vload8(0, src_addr_b); c00 = mlal_sat_qs16x8(c00, (short8)a0.s0, b0, FIXED_POINT_POSITION); c10 = mlal_sat_qs16x8(c10, (short8)a0.s1, b0, FIXED_POINT_POSITION); @@ -850,7 +752,7 @@ __kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0), vstore8(c30_qs16, 0, (__global short *)(offset(&dst, 0, 3))); } #endif // defined(FIXED_POINT_POSITION) -#endif // defined(COLS_B) +#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT) #if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y) #if defined(DATA_TYPE) diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp index 6886f54602..241dd8549d 100644 --- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp +++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -40,17 +40,16 @@ using namespace arm_compute::misc::shape_calculator; namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int mult_interleave4x4_height) { + ARM_COMPUTE_RETURN_ERROR_ON(mult_interleave4x4_height < 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8, DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); if(output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_interleaved_shape(*input)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_interleaved_shape(*input, mult_interleave4x4_height)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); } @@ -58,11 +57,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, int mult_interleave4x4_height) { - unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input->data_type()); + constexpr unsigned int num_elems_processed_per_iteration_x = 4; constexpr unsigned int num_elems_processed_per_iteration_y = 4; - const unsigned int num_elems_written_per_iteration = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y; + const unsigned int num_elems_written_per_iteration = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y * mult_interleave4x4_height; bool window_changed = false; // Configure kernel window @@ -73,7 +72,10 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen // Configure window in case of configured output if(output->total_size() != 0) { - AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration, 1, 4.f, 0.25f); + const float scale_x = 4.0f * static_cast<float>(mult_interleave4x4_height); + const float scale_y = 1.0f / (scale_x); + + AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration, 1, scale_x, scale_y); window_changed = window_changed || update_window_and_padding(win, output_access); output_access.set_valid_region(win, input->valid_region()); } @@ -88,25 +90,42 @@ CLGEMMInterleave4x4Kernel::CLGEMMInterleave4x4Kernel() { } -void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output) +void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output, int mult_interleave4x4_height) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_interleaved_shape(*input->info()))); + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_interleaved_shape(*input->info(), mult_interleave4x4_height))); // Perform validate step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mult_interleave4x4_height)); _input = input; _output = output; + // Create build options + CLBuildOptions build_opts; + build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height)); + switch(input->info()->element_size()) + { + case 1: + build_opts.add_option("-DDATA_TYPE=uchar"); + break; + case 2: + build_opts.add_option("-DDATA_TYPE=ushort"); + break; + case 4: + build_opts.add_option("-DDATA_TYPE=uint"); + break; + default: + ARM_COMPUTE_ERROR("Data type not supported"); + } + // Create kernel - std::string kernel_name = "gemm_interleave4x4_" + support::cpp11::to_string(input->info()->element_size() * 8) + "bit"; - _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name)); + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_interleave4x4", build_opts.options())); // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info()); + auto win_config = validate_and_configure_window(input->info(), output->info(), mult_interleave4x4_height); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure(win_config.second); @@ -119,10 +138,10 @@ void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *out _config_id += support::cpp11::to_string(output->info()->dimension(1)); } -Status CLGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output) +Status CLGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output, int mult_interleave4x4_height) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mult_interleave4x4_height)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), mult_interleave4x4_height).first); return Status{}; } @@ -144,10 +163,6 @@ void CLGEMMInterleave4x4Kernel::run(const Window &window, cl::CommandQueue &queu Window in_slice = window.first_slice_window_2D(); Window out_slice = window.first_slice_window_2D(); - // Change x and y steps for the slide of output tensor - out_slice.scale(Window::DimX, 4.f); - out_slice.scale(Window::DimY, 0.25f); - do { unsigned int idx = 0; diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp index 19f38bf5a5..e23feb269a 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -36,24 +36,68 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include <set> #include <string> using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; namespace { using ElementsProcessed = Steps; -inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed) +inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1); + if(!is_interleaved_transposed) { ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1)); + + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != output->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != output->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output); + } + } + else + { + const int m = reshape_info.m(); + const int n = reshape_info.n(); + const int k = reshape_info.k(); + const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width(); + const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); + + TensorShape tensor_shape0{ input0->tensor_shape() }; + tensor_shape0.set(0, k); + tensor_shape0.set(1, m); + + TensorShape tensor_shape1{ input1->tensor_shape() }; + tensor_shape1.set(0, n); + tensor_shape1.set(1, k); + + const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0); + const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1); + + const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_interleaved_shape(tensor_info0, mult_interleave4x4_height)); + const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width)); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1); + + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != static_cast<size_t>(n)); + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != static_cast<size_t>(m)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output); + } } return Status{}; @@ -122,12 +166,19 @@ CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel() { } -void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed) +void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); + // Output tensor auto inizialitation if not yet initialized + TensorShape tensor_shape{ input0->info()->tensor_shape() }; + tensor_shape.set(0, is_interleaved_transposed ? reshape_info.n() : input1->info()->dimension(0)); + tensor_shape.set(1, is_interleaved_transposed ? reshape_info.m() : input0->info()->dimension(1)); + + auto_init_if_empty(*output->info(), input0->info()->clone()->set_tensor_shape(tensor_shape)); + // Perform validate step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info)); _input0 = input0; _input1 = input1; @@ -176,7 +227,13 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen std::string kernel_name; if(is_interleaved_transposed) { + const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width(); + const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); + build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0))); + build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width)); + build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height)); + if(data_type == DataType::F32) { kernel_name = "gemm_mm_interleaved_transposed_f32_" + string_from_target(arch_target); @@ -230,11 +287,13 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1))); } -Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, GPUTarget gpu_target) +Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, + const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target) { + // Note: num_elements_processed will be set in validate_and_configure_window() ElementsProcessed num_elements_processed{}; ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed, reshape_info)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get(), diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp index 69a545b76b..63aed6df32 100644 --- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp +++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -42,8 +42,9 @@ using namespace arm_compute::misc::shape_calculator; namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int mult_transpose1xW_width) { + ARM_COMPUTE_RETURN_ERROR_ON(mult_transpose1xW_width < 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8, DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); @@ -51,7 +52,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) if(output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), - compute_transpose1xW_with_element_size_shape(*input)); + compute_transpose1xW_with_element_size_shape(*input, mult_transpose1xW_width)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); } @@ -59,11 +60,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration) +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration, int mult_transpose1xW_width) { num_elems_processed_per_iteration = 16 / input->element_size(); - const int scale_x = num_elems_processed_per_iteration; + const int scale_x = num_elems_processed_per_iteration * mult_transpose1xW_width; bool window_changed = false; // Configure kernel window @@ -90,25 +91,31 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen } } // namespace -void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output) +void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output, int mult_transpose1xW_width) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*input->info()))); + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*input->info(), mult_transpose1xW_width))); // Perform validate step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mult_transpose1xW_width)); _input = input; _output = output; // Configure kernel window + // Note: num_elems_processed_per_iteration will be set in validate_and_configure_window() unsigned int num_elems_processed_per_iteration = 1; - auto win_config = validate_and_configure_window(input->info(), output->info(), num_elems_processed_per_iteration); + auto win_config = validate_and_configure_window(input->info(), output->info(), num_elems_processed_per_iteration, mult_transpose1xW_width); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure(win_config.second); + // Create build options + CLBuildOptions build_opts; + build_opts.add_option("-DTRANSPOSE_W=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width)); + /* * Following an example of how the transposition1xW works when the input data type is F32 * @@ -117,18 +124,18 @@ void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *outp * |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 | * |a30 a31 a32 a33| * - * The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) + * The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) * mult_transpose1xW_width */ // Create kernel - std::string kernel_name = "gemm_transpose1x" + support::cpp11::to_string(num_elems_processed_per_iteration); - _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name)); + std::string kernel_name = "gemm_transpose1xW"; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); } -Status CLGEMMTranspose1xWKernel::validate(const ITensorInfo *input, const ITensorInfo *output) +Status CLGEMMTranspose1xWKernel::validate(const ITensorInfo *input, const ITensorInfo *output, int mult_transpose1xW_width) { unsigned int num_elems_processed_per_iteration = 1; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration).first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mult_transpose1xW_width)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration, mult_transpose1xW_width).first); return Status{}; } |