aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/cl_kernels/gemm.cl
diff options
context:
space:
mode:
authorGian Marco <gianmarco.iodice@arm.com>2018-01-12 10:21:40 +0000
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:44:21 +0000
commit36a0a4608bf413fc1fd65eb335bfb736ef602149 (patch)
tree2ff0e35dc9e16fedd601b1f24bdc13d25d075b90 /src/core/CL/cl_kernels/gemm.cl
parent46edf63bd630f5e3f3eb31b7d4602caa317da075 (diff)
downloadComputeLibrary-36a0a4608bf413fc1fd65eb335bfb736ef602149.tar.gz
COMPMID-748 - Integrating optimized SGEMM for bifrost
This patch introduces a new GEMM capable to improve the mac utilisation of 10% compared to the GEMM without reshape. However this implementation is not faster in all cases as we need to take into account the time for reshaping the matrices. For this reason an heuristic solution to select the optimal GEMM to use has been added to the function. More information about the heuristic implementation can be found at COMPMID-852. With this new patch, GoogleNet, MobileNet, VGG16 and SqueezeNet can improved the performance of 1.5x. More information about the performance uplift can be found here: https://confluence.arm.com/display/MLENG/GEMM+FP32+performance%3A+ACL+18.02 Change-Id: I024563c06b9aed02a211a974e452bae5c233b04c Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/117140 Reviewed-by: Pablo Tello <pablo.tello@arm.com> Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels/gemm.cl')
-rw-r--r--src/core/CL/cl_kernels/gemm.cl420
1 files changed, 161 insertions, 259 deletions
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index c763cb355b..bad09f3c42 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,41 +27,23 @@
#include "fixed_point.h"
#endif // FIXED_POINT_POSITION
-/** This OpenCL kernel computes the "vector" 1x4 transposition of input matrix
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U32/S32/F32
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_transpose1x4(IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- uint x = get_global_id(0);
- uint y = get_global_id(1);
+#if defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH)
- // Compute address for Matrix B - source
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
-
- // Compute address for Matrix B transposed - destination. X and Y are swapped
- uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
+#if TRANSPOSE_W == 4
+#define DATA_TYPE uint
+#elif TRANSPOSE_W == 8
+#define DATA_TYPE ushort
+#elif TRANSPOSE_W == 16
+#define DATA_TYPE uchar
+#else // TRANSPOSE_W == 16
+#error "Transpose width not supported"
+#endif // TRANSPOSE_W
- uint4 b0 = vload4(0, (__global uint *)src.ptr);
-
- vstore4(b0, 0, (__global uint *)(dst_ptr + dst_addr_in_bytes));
-}
-
-/** This OpenCL kernel computes the "vector" 1x8 transposition of input matrix
+/** This OpenCL kernel computes the "vector" 1xW transposition of input matrix
+ *
+ * @attention The multiplication factor (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
*
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/QS16/F16
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
@@ -69,12 +51,12 @@ __kernel void gemm_transpose1x4(IMAGE_DECLARATION(src),
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_transpose1x8(IMAGE_DECLARATION(src),
+__kernel void gemm_transpose1xW(IMAGE_DECLARATION(src),
IMAGE_DECLARATION(dst))
{
uint x = get_global_id(0);
@@ -84,16 +66,22 @@ __kernel void gemm_transpose1x8(IMAGE_DECLARATION(src),
Image src = CONVERT_TO_IMAGE_STRUCT(src);
// Compute address for Matrix B transposed - destination. X and Y are swapped
- uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
+ uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + y * TRANSPOSE_W * sizeof(DATA_TYPE) * MULT_TRANSPOSE1XW_WIDTH + (x / MULT_TRANSPOSE1XW_WIDTH) * dst_stride_y +
+ (x % MULT_TRANSPOSE1XW_WIDTH) * TRANSPOSE_W * sizeof(DATA_TYPE);
- ushort8 b0 = vload8(0, (__global ushort *)src.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, TRANSPOSE_W)
+ b0 = VLOAD(TRANSPOSE_W)(0, (__global DATA_TYPE *)src.ptr);
- vstore8(b0, 0, (__global ushort *)(dst_ptr + dst_addr_in_bytes));
+ VSTORE(TRANSPOSE_W)
+ (b0, 0, (__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes));
}
+#endif // defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH)
-/** This OpenCL kernel computes the "vector" 1x16 transposition of input matrix
+#if defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)
+
+/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
*
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
@@ -106,9 +94,10 @@ __kernel void gemm_transpose1x8(IMAGE_DECLARATION(src),
* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_transpose1x16(IMAGE_DECLARATION(src),
+__kernel void gemm_interleave4x4(IMAGE_DECLARATION(src),
IMAGE_DECLARATION(dst))
{
+ // Compute source and destination addresses
uint x = get_global_id(0);
uint y = get_global_id(1);
@@ -116,141 +105,35 @@ __kernel void gemm_transpose1x16(IMAGE_DECLARATION(src),
Image src = CONVERT_TO_IMAGE_STRUCT(src);
// Compute address for Matrix B transposed - destination. X and Y are swapped
- uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
-
- uchar16 b0 = vload16(0, (__global uchar *)src.ptr);
-
- vstore16(b0, 0, (__global uchar *)(dst_ptr + dst_addr_in_bytes));
-}
-
-/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U32/S32/F32
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_interleave4x4_32bit(IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- // Compute source and destination addresses
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Load values from Matrix A
- uint4 a0 = vload4(0, (__global uint *)(offset(&src, 0, 0)));
- uint4 a1 = vload4(0, (__global uint *)(offset(&src, 0, 1)));
- uint4 a2 = vload4(0, (__global uint *)(offset(&src, 0, 2)));
- uint4 a3 = vload4(0, (__global uint *)(offset(&src, 0, 3)));
-
- uint4 val0 = (uint4)(a0.s0, a1.s0, a2.s0, a3.s0);
- vstore4(val0, 0, ((__global uint *)dst.ptr) + 0);
-
- val0 = (uint4)(a0.s1, a1.s1, a2.s1, a3.s1);
- vstore4(val0, 0, ((__global uint *)dst.ptr) + 4);
-
- val0 = (uint4)(a0.s2, a1.s2, a2.s2, a3.s2);
- vstore4(val0, 0, ((__global uint *)dst.ptr) + 8);
-
- val0 = (uint4)(a0.s3, a1.s3, a2.s3, a3.s3);
- vstore4(val0, 0, ((__global uint *)dst.ptr) + 12);
-}
-
-/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/QS16/F16
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_interleave4x4_16bit(IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- // Compute source and destination addresses
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Load values from Matrix A
- ushort8 a0 = vload8(0, (__global ushort *)(offset(&src, 0, 0)));
- ushort8 a1 = vload8(0, (__global ushort *)(offset(&src, 0, 1)));
- ushort8 a2 = vload8(0, (__global ushort *)(offset(&src, 0, 2)));
- ushort8 a3 = vload8(0, (__global ushort *)(offset(&src, 0, 3)));
-
- ushort8 val0 = (ushort8)((ushort4)(a0.s0, a1.s0, a2.s0, a3.s0), (ushort4)(a0.s1, a1.s1, a2.s1, a3.s1));
- vstore8(val0, 0, ((__global ushort *)dst.ptr) + 0);
-
- val0 = (ushort8)((ushort4)(a0.s2, a1.s2, a2.s2, a3.s2), (ushort4)(a0.s3, a1.s3, a2.s3, a3.s3));
- vstore8(val0, 0, ((__global ushort *)dst.ptr) + 8);
-
- val0 = (ushort8)((ushort4)(a0.s4, a1.s4, a2.s4, a3.s4), (ushort4)(a0.s5, a1.s5, a2.s5, a3.s5));
- vstore8(val0, 0, ((__global ushort *)dst.ptr) + 16);
-
- val0 = (ushort8)((ushort4)(a0.s6, a1.s6, a2.s6, a3.s6), (ushort4)(a0.s7, a1.s7, a2.s7, a3.s7));
- vstore8(val0, 0, ((__global ushort *)dst.ptr) + 24);
-}
-
-/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- // Compute source and destination addresses
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * 16 * MULT_INTERLEAVE4X4_HEIGHT + (y / MULT_INTERLEAVE4X4_HEIGHT) * dst_stride_y +
+ (y % MULT_INTERLEAVE4X4_HEIGHT) * 4 * sizeof(DATA_TYPE);
// Load values from Matrix A
- uchar16 a0 = vload16(0, (__global uchar *)(offset(&src, 0, 0)));
- uchar16 a1 = vload16(0, (__global uchar *)(offset(&src, 0, 1)));
- uchar16 a2 = vload16(0, (__global uchar *)(offset(&src, 0, 2)));
- uchar16 a3 = vload16(0, (__global uchar *)(offset(&src, 0, 3)));
-
- uchar16 val0 = (uchar16)((uchar4)(a0.s0, a1.s0, a2.s0, a3.s0), (uchar4)(a0.s1, a1.s1, a2.s1, a3.s1),
- (uchar4)(a0.s2, a1.s2, a2.s2, a3.s2), (uchar4)(a0.s3, a1.s3, a2.s3, a3.s3));
- vstore16(val0, 0, ((__global uchar *)dst.ptr) + 0);
-
- val0 = (uchar16)((uchar4)(a0.s4, a1.s4, a2.s4, a3.s4), (uchar4)(a0.s5, a1.s5, a2.s5, a3.s5),
- (uchar4)(a0.s6, a1.s6, a2.s6, a3.s6), (uchar4)(a0.s7, a1.s7, a2.s7, a3.s7));
- vstore16(val0, 0, ((__global uchar *)dst.ptr) + 16);
-
- val0 = (uchar16)((uchar4)(a0.s8, a1.s8, a2.s8, a3.s8), (uchar4)(a0.s9, a1.s9, a2.s9, a3.s9),
- (uchar4)(a0.sA, a1.sA, a2.sA, a3.sA), (uchar4)(a0.sB, a1.sB, a2.sB, a3.sB));
- vstore16(val0, 0, ((__global uchar *)dst.ptr) + 32);
-
- val0 = (uchar16)((uchar4)(a0.sC, a1.sC, a2.sC, a3.sC), (uchar4)(a0.sD, a1.sD, a2.sD, a3.sD),
- (uchar4)(a0.sE, a1.sE, a2.sE, a3.sE), (uchar4)(a0.sF, a1.sF, a2.sF, a3.sF));
- vstore16(val0, 0, ((__global uchar *)dst.ptr) + 48);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ a0 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 0)));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ a1 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 1)));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ a2 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 2)));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ a3 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 3)));
+
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s0, a1.s0, a2.s0, a3.s0);
+ vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 0 * MULT_INTERLEAVE4X4_HEIGHT));
+
+ val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s1, a1.s1, a2.s1, a3.s1);
+ vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 4 * MULT_INTERLEAVE4X4_HEIGHT));
+
+ val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s2, a1.s2, a2.s2, a3.s2);
+ vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 8 * MULT_INTERLEAVE4X4_HEIGHT));
+
+ val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s3, a1.s3, a2.s3, a3.s3);
+ vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 12 * MULT_INTERLEAVE4X4_HEIGHT));
}
+#endif // defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)
-#if defined(COLS_B)
+#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
*
@@ -270,30 +153,32 @@ __kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src),
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
__kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
IMAGE_DECLARATION(dst))
{
- // src_addr.s0 = address of matrix A
- // src_addr.s1 = address of matrix B
-
- // Compute address for matrix A and B
- int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
- (src1_stride_y));
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
- // Add offset_first_element_in_bytes
- src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;
- // Divide by 4 in order to get the src_addr in unit of float
- src_addr = src_addr >> 2;
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ __global float *src_addr_a = (__global float *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
+ __global float *src_addr_b = (__global float *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
// Compute end row address for matrix B
- int end_row_mtx_b = src_addr.s1 + COLS_B;
+ __global float *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
// Reset accumulators
float4 c00 = 0.0f;
@@ -301,11 +186,11 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0)
float4 c20 = 0.0f;
float4 c30 = 0.0f;
- for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 8))
+ for(; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
{
// Load values from matrix A (interleaved) and matrix B (transposed)
- float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
- float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
+ float4 a0 = vload4(0, src_addr_a);
+ float4 b0 = vload4(0, src_addr_b);
c00 += (float4)a0.s0 * b0;
c10 += (float4)a0.s1 * b0;
@@ -313,8 +198,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0)
c30 += (float4)a0.s3 * b0;
// Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0 + 4);
- b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1 + 4);
+ a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
+ b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);
c00 += (float4)a0.s0 * b0;
c10 += (float4)a0.s1 * b0;
@@ -322,11 +207,11 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0)
c30 += (float4)a0.s3 * b0;
}
- for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 4))
+ for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH)
{
// Load values from matrix A (interleaved) and matrix B (transposed)
- float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
- float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
+ float4 a0 = vload4(0, src_addr_a);
+ float4 b0 = vload4(0, src_addr_b);
c00 += (float4)a0.s0 * b0;
c10 += (float4)a0.s1 * b0;
@@ -371,23 +256,33 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
IMAGE_DECLARATION(dst))
{
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;
+
// src_addr_a = address of matrix A
// src_addr_b = address of matrix B
- __global float *src_addr_a = (__global float *)(src0_ptr + get_global_id(1) * src0_stride_y + src0_offset_first_element_in_bytes);
- __global float *src_addr_b = (__global float *)(src1_ptr + get_global_id(0) * src1_stride_y + src1_offset_first_element_in_bytes);
+ __global float *src_addr_a = (__global float *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
+ __global float *src_addr_b = (__global float *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
// Compute end row address for matrix B
__global float *src_end_addr_b = src_addr_b + COLS_B;
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
+
// Reset accumulators
float c00 = 0.0f;
float c01 = 0.0f;
@@ -406,7 +301,7 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
float c32 = 0.0f;
float c33 = 0.0f;
- for(; src_addr_b <= (src_end_addr_b - 16); src_addr_a += 16, src_addr_b += 16)
+ for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += (16 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (16 * MULT_TRANSPOSE1XW_WIDTH))
{
// Load values from matrix A (interleaved) and matrix B (transposed)
float4 a0 = vload4(0, src_addr_a);
@@ -433,8 +328,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
c33 = fma(a0.s3, b0.s3, c33);
// Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a + 4);
- b0 = vload4(0, src_addr_b + 4);
+ a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
+ b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);
c00 = fma(a0.s0, b0.s0, c00);
c01 = fma(a0.s0, b0.s1, c01);
@@ -457,8 +352,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
c33 = fma(a0.s3, b0.s3, c33);
// Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a + 8);
- b0 = vload4(0, src_addr_b + 8);
+ a0 = vload4(0, src_addr_a + 8 * MULT_INTERLEAVE4X4_HEIGHT);
+ b0 = vload4(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);
c00 = fma(a0.s0, b0.s0, c00);
c01 = fma(a0.s0, b0.s1, c01);
@@ -481,8 +376,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
c33 = fma(a0.s3, b0.s3, c33);
// Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a + 12);
- b0 = vload4(0, src_addr_b + 12);
+ a0 = vload4(0, src_addr_a + 12 * MULT_INTERLEAVE4X4_HEIGHT);
+ b0 = vload4(0, src_addr_b + 12 * MULT_TRANSPOSE1XW_WIDTH);
c00 = fma(a0.s0, b0.s0, c00);
c01 = fma(a0.s0, b0.s1, c01);
@@ -505,7 +400,7 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
c33 = fma(a0.s3, b0.s3, c33);
}
- for(; src_addr_b < src_end_addr_b; src_addr_a += 4, src_addr_b += 4)
+ for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * MULT_TRANSPOSE1XW_WIDTH))
{
// Load values from matrix A (interleaved) and matrix B (transposed)
float4 a0 = vload4(0, src_addr_a);
@@ -555,8 +450,6 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
c33 = c33 * ALPHA;
#endif // defined(ALPHA)
- barrier(CLK_GLOBAL_MEM_FENCE);
-
// Store 4x4 block
vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(offset(&dst, 0, 0)));
vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(offset(&dst, 0, 1)));
@@ -584,30 +477,32 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
IMAGE_DECLARATION(dst))
{
- // src_addr.s0 = address of matrix A
- // src_addr.s1 = address of matrix B
-
- // Compute address for matrix A and B
- int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
- (src1_stride_y));
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
- // Add offset_first_element_in_bytes
- src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
- // Divide by 2 in order to get the src_addr in unit of half
- src_addr = src_addr >> 1;
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ __global half *src_addr_a = (__global half *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
+ __global half *src_addr_b = (__global half *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
// Compute end row address for matrix B
- int end_row_mtx_b = src_addr.s1 + COLS_B;
+ __global half *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
// Reset accumulators
half8 c00 = 0.0f;
@@ -615,11 +510,11 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
half8 c20 = 0.0f;
half8 c30 = 0.0f;
- for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(8, 16))
+ for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
{
// Load values from matrix A (interleaved) and matrix B (transposed)
- half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
- half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
+ half4 a0 = vload4(0, src_addr_a);
+ half8 b0 = vload8(0, src_addr_b);
c00 += (half8)a0.s0 * b0;
c10 += (half8)a0.s1 * b0;
@@ -627,8 +522,8 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
c30 += (half8)a0.s3 * b0;
// Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0 + 4);
- b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1 + 8);
+ a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
+ b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);
c00 += (half8)a0.s0 * b0;
c10 += (half8)a0.s1 * b0;
@@ -636,11 +531,11 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
c30 += (half8)a0.s3 * b0;
}
- for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 8))
+ for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
{
// Load values from matrix A (interleaved) and matrix B (transposed)
- half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
- half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
+ half4 a0 = vload4(0, src_addr_a);
+ half8 b0 = vload8(0, src_addr_b);
c00 += (half8)a0.s0 * b0;
c10 += (half8)a0.s1 * b0;
@@ -689,27 +584,32 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
__kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
IMAGE_DECLARATION(dst))
{
- // src_addr.s0 = address of matrix A
- // src_addr.s1 = address of matrix B
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
- // Compute address for matrix A and B
- int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
- (src1_stride_y));
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 16;
- // Add offset_first_element_in_bytes
- src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ __global char *src_addr_a = src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ __global char *src_addr_b = src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes;
// Compute end row address for matrix B
- int end_row_mtx_b = src_addr.s1 + COLS_B;
+ __global char *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
// Reset accumulators
short8 c00 = 0.0f;
@@ -722,11 +622,11 @@ __kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0),
short8 c31 = 0.0f;
// This for loop performs 1 accumulation for each iteration
- for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(4, 16))
+ for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
{
// Load values from matrix A (interleaved) and matrix B (transposed)
- char4 a0 = vload4(0, ((__global char *)src0_ptr) + src_addr.s0);
- char16 b0 = vload16(0, ((__global char *)src1_ptr) + src_addr.s1);
+ char4 a0 = vload4(0, src_addr_a);
+ char16 b0 = vload16(0, src_addr_b);
c00 = mlal_sat_qs8x8(c00, (char8)a0.s0, b0.s01234567, FIXED_POINT_POSITION);
c10 = mlal_sat_qs8x8(c10, (char8)a0.s1, b0.s01234567, FIXED_POINT_POSITION);
@@ -783,30 +683,32 @@ __kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0),
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
__kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
IMAGE_DECLARATION(dst))
{
- // src_addr.s0 = address of matrix A
- // src_addr.s1 = address of matrix B
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
- // Compute address for matrix A and B
- int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
- (src1_stride_y));
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
- // Add offset_first_element_in_bytes
- src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
-
- // Divide by 2 in order to get the src_addr in unit of short
- src_addr = src_addr >> 1;
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ __global short *src_addr_a = (__global short *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
+ __global short *src_addr_b = (__global short *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
// Compute end row address for matrix B
- int end_row_mtx_b = src_addr.s1 + COLS_B;
+ __global short *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
// Reset accumulators
int8 c00 = 0.0f;
@@ -815,11 +717,11 @@ __kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),
int8 c30 = 0.0f;
// This for loop performs 1 accumulation for each iteration
- for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(4, 8))
+ for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
{
/* Load values from matrix A (interleaved) and matrix B (transposed) */
- short4 a0 = vload4(0, ((__global short *)src0_ptr) + src_addr.s0);
- short8 b0 = vload8(0, ((__global short *)src1_ptr) + src_addr.s1);
+ short4 a0 = vload4(0, src_addr_a);
+ short8 b0 = vload8(0, src_addr_b);
c00 = mlal_sat_qs16x8(c00, (short8)a0.s0, b0, FIXED_POINT_POSITION);
c10 = mlal_sat_qs16x8(c10, (short8)a0.s1, b0, FIXED_POINT_POSITION);
@@ -850,7 +752,7 @@ __kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),
vstore8(c30_qs16, 0, (__global short *)(offset(&dst, 0, 3)));
}
#endif // defined(FIXED_POINT_POSITION)
-#endif // defined(COLS_B)
+#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)
#if defined(DATA_TYPE)