author    SiCong Li <sicong.li@arm.com>  2020-11-05 09:18:11 +0000
committer SiCong Li <sicong.li@arm.com>  2020-11-09 10:10:53 +0000
commit    0ea50e3233c0b20a3e3c68e42bdff31565cefa3d (patch)
tree      61a0029285a8c85f67a224753584abf2c0da6586
parent    bef7fa27b0d231a8649952f60808132d109b6345 (diff)
download  ComputeLibrary-0ea50e3233c0b20a3e3c68e42bdff31565cefa3d.tar.gz
COMPMID-3730: Remove CLGEMMMatrixMultiplyKernel Patch2
Change-Id: I56137938c9ebe1a5aeeaa750b39fcfc6818016f1
Signed-off-by: SiCong Li <sicong.li@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4332
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--  src/core/CL/cl_kernels/gemm_v1.cl                  | 248
-rw-r--r--  src/core/CL/cl_kernels/helpers.h                   |  62
-rw-r--r--  src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp |  68
3 files changed, 223 insertions(+), 155 deletions(-)
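
In short, the patch removes the output padding requirement of CLGEMMMatrixMultiplyKernel: instead of padding the destination so every work-item can issue full-width vstores, each kernel now clamps its final stores at the right and bottom edges via STORE_BLOCK_BOUNDARY_AWARE. A minimal scalar sketch of the idea in C (names and shapes are hypothetical; the real logic is the vectorised macro used in the hunks below):

    #include <stdio.h>

    /* Store an m0 x n0 tile into dst (M rows, N columns), clamping the store
     * extents at the right/bottom edges instead of relying on padding.
     * Hypothetical scalar stand-in for the vectorised OpenCL stores. */
    static void store_block_boundary_aware(float *dst, int M, int N,
                                           int row0, int col0, int m0, int n0,
                                           const float *tile)
    {
        const int rows = (row0 + m0 > M) ? M - row0 : m0; /* PARTIAL_STORE_M0 at the edge */
        const int cols = (col0 + n0 > N) ? N - col0 : n0; /* PARTIAL_STORE_N0 at the edge */
        for (int r = 0; r < rows; ++r)
            for (int c = 0; c < cols; ++c)
                dst[(row0 + r) * N + (col0 + c)] = tile[r * n0 + c];
    }

    int main(void)
    {
        float dst[5 * 6] = { 0 };
        const float tile[4 * 4] = { /* 4x4 accumulator block */
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
        };
        store_block_boundary_aware(dst, 5, 6, 4, 4, 4, 4, tile); /* only a 1x2 corner fits */
        printf("%g %g\n", dst[4 * 6 + 4], dst[4 * 6 + 5]);       /* prints: 1 2 */
        return 0;
    }
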
diff --git a/src/core/CL/cl_kernels/gemm_v1.cl b/src/core/CL/cl_kernels/gemm_v1.cl
index 231f81a123..5f8b4f694e 100644
--- a/src/core/CL/cl_kernels/gemm_v1.cl
+++ b/src/core/CL/cl_kernels/gemm_v1.cl
@@ -24,10 +24,14 @@
#include "gemm_helpers.h"
#include "repeat.h"
-#if defined(K) && defined(H0) && defined(V0)
+#if defined(M) && defined(N) && defined(K) && defined(H0) && defined(V0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
*
+ * @note The number of rows of the destination matrix must be passed at compile time using -DM
+ * @note The number of columns of the destination matrix must be passed at compile time using -DN
* @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
* @note The optional alpha value needs to be passed at compile time using -DALPHA
* @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
@@ -239,19 +243,21 @@ __kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
#endif // defined(ACTIVATION_TYPE)
// Store 4x4 block
- vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
- vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
- vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
- vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
+ const bool cond_y = ((get_global_id(1) + 1) * 4 >= M);
+ const bool cond_x = ((get_global_id(0) + 1) * 4 >= N);
+ STORE_BLOCK_BOUNDARY_AWARE(4, 4, float, c, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
}
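
For context, STORE_BLOCK_BOUNDARY_AWARE takes the full block shape (here 4x4), the partial shapes PARTIAL_STORE_M0/PARTIAL_STORE_N0, and the two edge flags, and dispatches to one of four store paths so that interior blocks keep the full-width vstore fast path. A rough C model of the dispatch (illustrative only, not the macro's exact expansion):

    #include <stdio.h>

    /* Hypothetical scalar model: the macro picks one of four store paths
     * from the two boundary flags computed just above it. */
    static const char *pick_store_path(int cond_y, int cond_x)
    {
        if (cond_y && cond_x) return "partial store in x and y";
        if (cond_y)           return "partial store in y only";
        if (cond_x)           return "partial store in x only";
        return "full M0 x N0 block (fast vstore path)";
    }

    int main(void)
    {
        /* e.g. a work-item whose 4x4 tile crosses only the right edge of dst */
        printf("%s\n", pick_store_path(0, 1));
        return 0;
    }
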
/** This OpenCL kernel is optimized for Bifrost and it computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
*
+ * @note The number of rows of the destination matrix must be passed at compile time using -DM
+ * @note The number of columns of the destination matrix must be passed at compile time using -DN
* @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
* @note The optional alpha value needs to be passed at compile time using -DALPHA
* @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
- * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
*
@@ -566,16 +572,19 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
#endif // defined(ACTIVATION_TYPE)
// Store 4x4 block
- vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
- vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
- vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
- vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
+ const bool cond_y = ((get_global_id(1) + 1) * 4 >= M);
+ const bool cond_x = ((get_global_id(0) + 1) * 4 >= N);
+ STORE_BLOCK_BOUNDARY_AWARE(4, 4, float, c, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
}
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
*
+ * @note The number of rows of the destination matrix must be passed at compile time using -DM
+ * @note The number of columns of the destination matrix must be passed at compile time using -DN
* @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
* @note The optional alpha value needs to be passed at compile time using -DALPHA
* @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
@@ -788,15 +797,18 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
#endif // defined(ACTIVATION_TYPE)
// Store 4x8 block
- vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
- vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
- vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
- vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
+ const bool cond_y = ((get_global_id(1) + 1) * 4 >= M);
+ const bool cond_x = ((get_global_id(0) + 1) * 8 >= N);
+ STORE_BLOCK_BOUNDARY_AWARE(4, 8, half, c, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
}
/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1) while accumulating the result in a 32-bit floating point variable.
*
+ * @note The number of rows of the destination matrix must be passed at compile time using -DM
+ * @note The number of columns of the destination matrix must be passed at compile time using -DN
* @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
* @note The optional alpha value needs to be passed at compile time using -DALPHA
* @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
@@ -1019,15 +1031,18 @@ __kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),
#endif // defined(ACTIVATION_TYPE)
// Store 4x8 block
- vstore8(c_h0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
- vstore8(c_h1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
- vstore8(c_h2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
- vstore8(c_h3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
+ const bool cond_y = ((get_global_id(1) + 1) * 4 >= M);
+ const bool cond_x = ((get_global_id(0) + 1) * 8 >= N);
+ STORE_BLOCK_BOUNDARY_AWARE(4, 8, half, c_h, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
}
/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
*
+ * @note The number of rows of the destination matrix must be passed at compile time using -DM
+ * @note The number of columns of the destination matrix must be passed at compile time using -DN
* @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
* @note The optional alpha value needs to be passed at compile time using -DALPHA
* @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
@@ -1315,17 +1330,16 @@ __kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0)
#endif // defined(ACTIVATION_TYPE)
// Store 4x8 block
- vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
- vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
- vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
- vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
+ const bool cond_y = ((get_global_id(1) + 1) * 4 >= M);
+ const bool cond_x = ((get_global_id(0) + 1) * 8 >= N);
+ STORE_BLOCK_BOUNDARY_AWARE(4, 8, half, c, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
}
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
-#endif // defined(K) && defined(H0) && defined(V0)
+#endif // defined(M) && defined(N) && defined(K) && defined(H0) && defined(V0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
-#if defined(K) && defined(N0) && (M0)
+#if defined(N) && defined(K) && defined(M0) && defined(N0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
#if defined(DATA_TYPE)
#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, N0)
/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped.
@@ -1333,7 +1347,10 @@ __kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0)
* @note This OpenCL kernel works with floating point data types (F16/F32)
* @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
* @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0
- * @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DK and -DALPHA
+ * @note The number of columns of matrix A and the number of columns of matrix B must be passed at compile time using -DK and -DN
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note The optional alpha value needs to be passed at compile time using -DALPHA
* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
*
@@ -1405,7 +1422,7 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
// Update address for the matrix A
- src_addr.s0 += get_global_id(1) * src0_stride_y * M0;
+ src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y;
// Update address for the matrix B
src_addr.s1 += idx * sizeof(DATA_TYPE);
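
COMPUTE_M0_START_ROW replaces the old get_global_id(1) * M0 offset: when M is not a multiple of M0, later row blocks shift up so that no block reads or writes past row M - 1, and block 0 absorbs the partial rows. A self-contained C sketch of the presumed semantics (illustrative; the actual macro lives in the OpenCL helper headers):

    #include <stdio.h>

    /* Presumed behaviour: block 0 starts at row 0 (and later stores only its
     * PARTIAL_STORE_M0 top rows); every later block shifts up by
     * (M0 - PARTIAL_STORE_M0) % M0 so no block starts beyond M - M0. */
    static int compute_m0_start_row(int gid_y, int m0, int partial_store_m0)
    {
        int row = gid_y * m0 - (m0 - partial_store_m0) % m0;
        return row < 0 ? 0 : row;
    }

    int main(void)
    {
        /* M = 9, M0 = 4 -> PARTIAL_STORE_M0 = 1; blocks start at rows 0, 1, 5 */
        for (int y = 0; y < 3; ++y)
            printf("block %d starts at row %d\n", y, compute_m0_start_row(y, 4, 1));
        return 0;
    }
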
@@ -1426,8 +1443,8 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
// | |
// |__________________|
- // The plane (zin) is calculated dividing M (get_global_id(1) * M0) by HEIGHT_GEMM3D
- uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * M0)) / (uint4)HEIGHT_GEMM3D;
+ // The plane (zin) is calculated dividing row by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
zin = min(DEPTH_GEMM3D - 1, zin);
// Add offset due to the cross plane paddings
@@ -1554,11 +1571,10 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
int z = get_global_id(2);
- // Compute destination address
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
// Compute dst address
- __global uchar *dst_addr = offset(&dst, 0, 0);
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0,
+ PARTIAL_STORE_M0)
+ * dst_stride_y);
uint4 zout = 0;
@@ -1579,8 +1595,8 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
// | |
// |__________________|
- // The plane (zout) is calculated dividing M (get_global_id(1) * M0) by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * M0)) / (uint4)HEIGHT_GEMM3D;
+ // The plane (zout) is calculated dividing row by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
zout = min(DEPTH_GEMM3D - 1, zout);
// Add offset due to the cross plane paddings
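
The updated zin/zout computations feed this shifted start row into the existing plane arithmetic: each of the M0 rows is divided by HEIGHT_GEMM3D to find its plane, then clamped to DEPTH_GEMM3D - 1. A quick worked example in C (values illustrative):

    #include <stdio.h>

    int main(void)
    {
        /* HEIGHT_GEMM3D = 4, DEPTH_GEMM3D = 2, start row 5 (from COMPUTE_M0_START_ROW):
         * rows 5,6,7,8 -> raw planes 1,1,1,2; the last is clamped to DEPTH_GEMM3D - 1 = 1 */
        const unsigned height_gemm3d = 4, depth_gemm3d = 2, start_row = 5;
        for (unsigned i = 0; i < 4; ++i) {
            unsigned z = (start_row + i) / height_gemm3d;
            if (z > depth_gemm3d - 1) z = depth_gemm3d - 1; /* min(DEPTH_GEMM3D - 1, zout) */
            printf("row %u -> plane %u\n", start_row + i, z);
        }
        return 0;
    }
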
@@ -1616,8 +1632,10 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
ADD_BLOCK_BROADCAST(M0, acc, bias0);
#else // defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * src2_stride_y) + get_global_id(
- 2) * src2_stride_z;
+ __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0,
+ PARTIAL_STORE_M0)
+ * src2_stride_y)
+ + z * src2_stride_z;
LOAD_BLOCK(M0, N0, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);
@@ -1636,7 +1654,9 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
#endif // defined(ACTIVATION_TYPE)
// Store output block
- STORE_BLOCK(M0, N0, DATA_TYPE, acc, dst_addr, dst_stride_y, zout.s);
+ const bool cond_y = get_global_id(1) == 0;
+ const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, acc, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
}
#endif // defined(DATA_TYPE)
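
Note why cond_y differs between the two kernel families: the reshaped kernels keep their partial rows at the bottom, so they test (get_global_id(1) + 1) * 4 >= M, while here COMPUTE_M0_START_ROW shifts later blocks up and the overlap is resolved by block 0 storing only PARTIAL_STORE_M0 rows, hence cond_y is simply get_global_id(1) == 0. A small C comparison of the two conventions (illustrative sizes):

    #include <stdio.h>

    int main(void)
    {
        /* Illustrative sizes: M = 9 rows, M0 = 4 rows per block, 3 row blocks. */
        const int M = 9, M0 = 4, num_blocks = 3;
        for (int y = 0; y < num_blocks; ++y) {
            int cond_y_reshaped   = (y + 1) * M0 >= M; /* partial block at the bottom */
            int cond_y_unreshaped = (y == 0);          /* partial block at the top    */
            printf("y=%d reshaped=%d unreshaped=%d\n", y, cond_y_reshaped, cond_y_unreshaped);
        }
        return 0;
    }
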
@@ -1644,9 +1664,11 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
*
* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
* @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0.
- * This kernel optimally uses -DN0=4.
- * @note The number of matrix A columns must be passed at compile time using -DK.
- * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ * @note This kernel processes a fixed number of elements along x: -DN0=4.
+ * @note The number of columns of matrix A and the number of columns of matrix B must be passed at compile time using -DK and -DN
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note The optional alpha value needs to be passed at compile time using -DALPHA
* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
*
@@ -1718,7 +1740,7 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
// Update address for matrix A
- src_addr.s0 += get_global_id(1) * src0_stride_y * M0;
+ src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y;
// Update address for matrix B
src_addr.s1 += idx * sizeof(float);
@@ -1739,8 +1761,8 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
// | |
// |__________________|
- // The plane (zin) is calculated dividing M (get_global_id(1) * M0) by HEIGHT_GEMM3D
- uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * M0)) / (uint4)HEIGHT_GEMM3D;
+ // The plane (zin) is calculated dividing row by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
zin = min(DEPTH_GEMM3D - 1, zin);
// Add offset due to the cross plane paddings
@@ -1999,11 +2021,9 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
int z = get_global_id(2);
- // Compute destination address
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
// Compute dst address
- __global uchar *dst_addr = offset(&dst, 0, 0);
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0,
+ PARTIAL_STORE_M0) * dst_stride_y);
uint4 zout = 0;
@@ -2023,8 +2043,8 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
// | |
// |__________________|
- // The plane (zout) is calculated dividing M (get_global_id(1) * M0) by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * M0)) / (uint4)HEIGHT_GEMM3D;
+ // The plane (zout) is calculated dividing row by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
zout = min(DEPTH_GEMM3D - 1, zout);
// Add offset due to the cross plane paddings
@@ -2060,8 +2080,10 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
ADD_BLOCK_BROADCAST(M0, acc, bias0);
#else // defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)M0 * src2_stride_y) + get_global_id(
- 2) * src2_stride_z;
+ __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0,
+ PARTIAL_STORE_M0)
+ * src2_stride_y)
+ + z * src2_stride_z;
LOAD_BLOCK(M0, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
@@ -2080,16 +2102,9 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
#endif // defined(ACTIVATION_TYPE)
// Store the output block
- vstore4(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
-#if M0 > 1
- vstore4(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
-#endif // M0 > 1
-#if M0 > 2
- vstore4(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
-#endif // M0 > 2
-#if M0 > 3
- vstore4(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
-#endif // M0 > 3
+ const bool cond_y = get_global_id(1) == 0;
+ const bool cond_x = ((get_global_id(0) + 1) * 4 >= N);
+ STORE_BLOCK_BOUNDARY_AWARE(M0, 4, float, acc, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
}
/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
@@ -2097,9 +2112,11 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
* This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less than or equal to 1000.
* @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0.
- * This kernel optimally uses -DN0=2.
- * @note The number of matrix A columns must be passed at compile time using -DK.
- * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f.
+ * @note This kernel processes a fixed number of elements along x: -DN0=2.
+ * @note The number of columns of matrix A and the number of columns of matrix B must be passed at compile time using -DK and -DN
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note The optional alpha value needs to be passed at compile time using -DALPHA
* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
*
@@ -2172,7 +2189,7 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
// Update address for the matrix A
- src_addr.s0 += get_global_id(1) * src0_stride_y * M0;
+ src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y;
// Update address for the matrix B
src_addr.s1 += idx * sizeof(float);
@@ -2193,8 +2210,8 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
// | |
// |__________________|
- // The plane (zin) is calculated dividing M (get_global_id(1) * M0) by HEIGHT_GEMM3D
- uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * M0)) / (uint4)HEIGHT_GEMM3D;
+ // The plane (zin) is calculated dividing row by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
zin = min(DEPTH_GEMM3D - 1, zin);
// Add offset due to the cross plane paddings
@@ -2408,11 +2425,9 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
int z = get_global_id(2);
- // Compute destination address
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
// Compute dst address
- __global uchar *dst_addr = offset(&dst, 0, 0);
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0,
+ PARTIAL_STORE_M0) * dst_stride_y);
uint4 zout = 0;
@@ -2433,8 +2448,8 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
// | |
// |__________________|
- // The plane (zout) is calculated dividing M (get_global_id(1) * M0) by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * M0)) / (uint4)HEIGHT_GEMM3D;
+ // The plane (zout) is calculated dividing row by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
zout = min(DEPTH_GEMM3D - 1, zout);
// Add offset due to the cross plane paddings
@@ -2470,8 +2485,10 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
ADD_BLOCK_BROADCAST(M0, acc, bias0);
#else // defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (get_global_id(1) * (uint)M0 * src2_stride_y) + get_global_id(
- 2) * src2_stride_z;
+ __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0,
+ PARTIAL_STORE_M0)
+ * src2_stride_y)
+ + z * src2_stride_z;
LOAD_BLOCK(M0, 2, float, bias, src2_addr, 0, src2_stride_y, zero);
@@ -2490,16 +2507,9 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
#endif // defined(ACTIVATION_TYPE)
// Store the output block
- vstore2(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
-#if M0 > 1
- vstore2(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
-#endif // M0 > 1
-#if M0 > 2
- vstore2(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
-#endif // M0 > 2
-#if M0 > 3
- vstore2(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
-#endif // M0 > 3
+ const bool cond_y = get_global_id(1) == 0;
+ const bool cond_x = ((get_global_id(0) + 1) * 2 >= N);
+ STORE_BLOCK_BOUNDARY_AWARE(M0, 2, float, acc, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
}
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
@@ -2507,9 +2517,11 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
*
* @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulates the result in a 32-bit floating point variable.
* @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0.
- * This kernel optimally uses -DN0=4.
- * @note The number of matrix A columns must be passed at compile time using -DK.
- * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ * @note This kernel processes a fixed number of elements along x: -DN0=8.
+ * @note The number of columns of matrix A and the number of columns of matrix B must be passed at compile time using -DK and -DN
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note The optional alpha value needs to be passed at compile time using -DALPHA
* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
*
@@ -2581,7 +2593,7 @@ __kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
// Update address for the matrix A
- src_addr.s0 += get_global_id(1) * src0_stride_y * M0;
+ src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y;
// Update address for the matrix B
src_addr.s1 += idx * sizeof(half);
@@ -2602,8 +2614,8 @@ __kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
// | |
// |__________________|
- // The plane (zin) is calculated dividing M (get_global_id(1) * M0) by HEIGHT_GEMM3D
- uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * M0)) / (uint4)HEIGHT_GEMM3D;
+ // The plane (zin) is calculated dividing row by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
zin = min(DEPTH_GEMM3D - 1, zin);
// Add offset due to the cross plane paddings
@@ -2764,11 +2776,8 @@ __kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
int z = get_global_id(2);
- // Compute destination address
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
// Compute dst address
- __global uchar *dst_addr = offset(&dst, 0, 0);
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * dst_stride_y);
uint4 zout = 0;
@@ -2789,8 +2798,8 @@ __kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
// | |
// |__________________|
- // The plane (zout) is calculated dividing M (get_global_id(1) * M0) by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * M0)) / (uint4)HEIGHT_GEMM3D;
+ // The plane (zout) is calculated dividing row by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
zout = min(DEPTH_GEMM3D - 1, zout);
// Add offset due to the cross plane paddings
@@ -2827,8 +2836,10 @@ __kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
ADD_BLOCK_BROADCAST(M0, acc, bias_f0);
#else // defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)M0 * src2_stride_y) + get_global_id(
- 2) * src2_stride_z;
+ __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0,
+ PARTIAL_STORE_M0)
+ * src2_stride_y)
+ + z * src2_stride_z;
LOAD_BLOCK(M0, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
@@ -2869,16 +2880,20 @@ __kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
#endif // defined(ACTIVATION_TYPE)
// Store the output block
- STORE_BLOCK(M0, 8, half, acc_h, dst_addr, dst_stride_y, zout.s);
+ const bool cond_y = get_global_id(1) == 0;
+ const bool cond_x = ((get_global_id(0) + 1) * 8 >= N);
+ STORE_BLOCK_BOUNDARY_AWARE(M0, 8, half, acc_h, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
}
/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
*
* @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.
* @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0.
- * This kernel optimally uses -DN0=4.
- * @note The number of matrix A columns must be passed at compile time using -DK.
- * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ * @note This kernel processes a fixed number of elements along x: -DN0=8.
+ * @note The number of columns of matrix A and the number of columns of matrix B must be passed at compile time using -DK and -DN
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note The optional alpha value needs to be passed at compile time using -DALPHA
* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
*
@@ -2950,7 +2965,7 @@ __kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
// Update address for the matrix A
- src_addr.s0 += get_global_id(1) * src0_stride_y * M0;
+ src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y;
// Update address for the matrix B
src_addr.s1 += idx * sizeof(half);
@@ -2971,8 +2986,8 @@ __kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
// | |
// |__________________|
- // The plane (zin) is calculated dividing M (get_global_id(1) * M0) by HEIGHT_GEMM3D
- uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * M0)) / (uint4)HEIGHT_GEMM3D;
+ // The plane (zin) is calculated dividing row by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
zin = min(DEPTH_GEMM3D - 1, zin);
// Add offset due to the cross plane paddings
@@ -3133,11 +3148,8 @@ __kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
int z = get_global_id(2);
- // Compute destination address
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
// Compute dst address
- __global uchar *dst_addr = offset(&dst, 0, 0);
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * dst_stride_y);
uint4 zout = 0;
@@ -3158,8 +3170,8 @@ __kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
// | |
// |__________________|
- // The plane (zout) is calculated dividing M (get_global_id(1) * M0) by HEIGHT_GEMM3D
- zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * M0)) / (uint4)HEIGHT_GEMM3D;
+ // The plane (zout) is calculated dividing row by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
zout = min(DEPTH_GEMM3D - 1, zout);
// Add offset due to the cross plane paddings
@@ -3195,8 +3207,10 @@ __kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
ADD_BLOCK_BROADCAST(M0, acc, bias0);
#else // defined(BROADCAST_BIAS)
- __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)M0 * src2_stride_y) + get_global_id(
- 2) * src2_stride_z;
+ __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0,
+ PARTIAL_STORE_M0)
+ * src2_stride_y)
+ + z * src2_stride_z;
LOAD_BLOCK(M0, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
@@ -3215,8 +3229,10 @@ __kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
#endif // defined(ACTIVATION_TYPE)
// Store the output block
- STORE_BLOCK(M0, 8, half, acc, dst_addr, dst_stride_y, zout.s);
+ const bool cond_y = get_global_id(1) == 0;
+ const bool cond_x = ((get_global_id(0) + 1) * 8 >= N);
+ STORE_BLOCK_BOUNDARY_AWARE(M0, 8, half, acc, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
}
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
-#endif // defined(K) && defined(N0) && (M0)
\ No newline at end of file
+#endif // defined(N) && defined(K) && defined(M0) && defined(N0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 0bdf16dab1..1f637ade2f 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -282,21 +282,75 @@
// Size == 1 (scalar)
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
+#define vstore_partial_1_2 NO_STORE
+#define vstore_partial_1_3 NO_STORE
+#define vstore_partial_1_4 NO_STORE
+#define vstore_partial_1_5 NO_STORE
+#define vstore_partial_1_6 NO_STORE
+#define vstore_partial_1_7 NO_STORE
+#define vstore_partial_1_8 NO_STORE
+#define vstore_partial_1_9 NO_STORE
+#define vstore_partial_1_10 NO_STORE
+#define vstore_partial_1_11 NO_STORE
+#define vstore_partial_1_12 NO_STORE
+#define vstore_partial_1_13 NO_STORE
+#define vstore_partial_1_14 NO_STORE
+#define vstore_partial_1_15 NO_STORE
+#define vstore_partial_1_16 NO_STORE
// Size == 2
#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
+#define vstore_partial_2_3 NO_STORE
+#define vstore_partial_2_4 NO_STORE
+#define vstore_partial_2_5 NO_STORE
+#define vstore_partial_2_6 NO_STORE
+#define vstore_partial_2_7 NO_STORE
+#define vstore_partial_2_8 NO_STORE
+#define vstore_partial_2_9 NO_STORE
+#define vstore_partial_2_10 NO_STORE
+#define vstore_partial_2_11 NO_STORE
+#define vstore_partial_2_12 NO_STORE
+#define vstore_partial_2_13 NO_STORE
+#define vstore_partial_2_14 NO_STORE
+#define vstore_partial_2_15 NO_STORE
+#define vstore_partial_2_16 NO_STORE
// Size == 3
#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
+#define vstore_partial_3_4 NO_STORE
+#define vstore_partial_3_5 NO_STORE
+#define vstore_partial_3_6 NO_STORE
+#define vstore_partial_3_7 NO_STORE
+#define vstore_partial_3_8 NO_STORE
+#define vstore_partial_3_9 NO_STORE
+#define vstore_partial_3_10 NO_STORE
+#define vstore_partial_3_11 NO_STORE
+#define vstore_partial_3_12 NO_STORE
+#define vstore_partial_3_13 NO_STORE
+#define vstore_partial_3_14 NO_STORE
+#define vstore_partial_3_15 NO_STORE
+#define vstore_partial_3_16 NO_STORE
// Size == 4
#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
+#define vstore_partial_4_5 NO_STORE
+#define vstore_partial_4_6 NO_STORE
+#define vstore_partial_4_7 NO_STORE
+#define vstore_partial_4_8 NO_STORE
+#define vstore_partial_4_9 NO_STORE
+#define vstore_partial_4_10 NO_STORE
+#define vstore_partial_4_11 NO_STORE
+#define vstore_partial_4_12 NO_STORE
+#define vstore_partial_4_13 NO_STORE
+#define vstore_partial_4_14 NO_STORE
+#define vstore_partial_4_15 NO_STORE
+#define vstore_partial_4_16 NO_STORE
// Size == 8
#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
@@ -307,6 +361,14 @@
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
+#define vstore_partial_8_9 NO_STORE
+#define vstore_partial_8_10 NO_STORE
+#define vstore_partial_8_11 NO_STORE
+#define vstore_partial_8_12 NO_STORE
+#define vstore_partial_8_13 NO_STORE
+#define vstore_partial_8_14 NO_STORE
+#define vstore_partial_8_15 NO_STORE
+#define vstore_partial_8_16 NO_STORE
// Size == 16
#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
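
The two-suffix naming encodes a compile-time dispatch: vstore_partial_SIZE_STORE token-pastes to the vstore_partial_STORE helper that writes STORE elements of a SIZE-wide vector, or to NO_STORE when nothing should be written. The new entries simply complete the table so that any (SIZE, STORE) pair the boundary-aware macros can generate expands to something defined. A C analogue of the pattern (hypothetical names, just to show the paste-and-forward trick):

    #include <stdio.h>

    /* Token-pasting dispatch, as the vstore_partial_SIZE_STORE table does:
     * the second suffix selects how many elements are actually written. */
    #define NO_STORE(data, offset, ptr) /* empty: this combination never stores */
    #define store_partial_2(data, offset, ptr) printf("store 2 elements of %s\n", #data)
    #define store_partial_4_0 NO_STORE
    #define store_partial_4_2 store_partial_2
    #define STORE_PARTIAL(size, n) store_partial_##size##_##n

    int main(void)
    {
        STORE_PARTIAL(4, 2)(vec, 0, dst); /* expands to store_partial_2(...)     */
        STORE_PARTIAL(4, 0)(vec, 0, dst); /* expands to NO_STORE: nothing emitted */
        return 0;
    }
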
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index b0d08a756c..2419104fba 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -197,22 +197,7 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
num_elems_processed_per_iteration_y = 4;
- // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor
- // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic
- const int m = reshape_info.m();
- const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
-
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), input0->dimension(1));
- AccessWindowStatic input1_access(input1, 0, 0,
- ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
- ceil_to_multiple(input1->dimension(1), num_elems_processed_per_iteration_y));
- AccessWindowStatic output_access(output, 0, 0,
- ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
- output->dimension(1) + bottom_pad);
-
+ win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
if(input2 != nullptr)
{
const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
@@ -223,16 +208,8 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),
ceil_to_multiple(input2->dimension(1), bias_processed_per_iteration_y));
- window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
+ window_changed = update_window_and_padding(win, input2_access); // window used by the execute_window_loop
}
- else
- {
- window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
- }
-
- output_access.set_valid_region(win_out, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
}
else // The input tensors have not been reshaped
{
@@ -240,11 +217,6 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
- // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor
- // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic
- const int m = reinterpret_input_as_3d ? input0->tensor_shape()[1] * input0->tensor_shape()[2] : input0->tensor_shape()[1];
- const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
-
// Create kernels according to the architecture, data type and input size.
GPUTarget arch_target = get_arch_from_target(gpu_target);
if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)
@@ -255,22 +227,19 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
// Configure window
win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), input0->dimension(1) + bottom_pad);
+ AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), input0->dimension(1));
AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));
AccessWindowStatic output_access(output, 0, 0,
- ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
- output->dimension(1) + bottom_pad);
+ output->dimension(0),
+ output->dimension(1));
if(input2 != nullptr)
{
const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
- const int bias_processed_per_iteration_y = reshape_info.broadcast_bias() ? 1 : num_elems_processed_per_iteration_y;
-
AccessWindowStatic input2_access(input2, 0, 0,
ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),
- ceil_to_multiple(input2->dimension(1), bias_processed_per_iteration_y));
+ input2->dimension(1));
window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop
update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
@@ -319,6 +288,8 @@ void CLGEMMMatrixMultiplyKernel::configure(const CLCompileContext &compile_conte
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), (input2 != nullptr) ? input2->info() : nullptr, output->info(), beta,
is_interleaved_transposed, reshape_info, fp_mixed_precision));
+ auto padding_info = is_interleaved_transposed ? get_padding_info({ input0, input1, output }) : get_padding_info({ input0, output });
+
_input0 = input0;
_input1 = input1;
_input2 = helpers::float_ops::is_zero(beta) ? nullptr : input2;
@@ -354,12 +325,22 @@ void CLGEMMMatrixMultiplyKernel::configure(const CLCompileContext &compile_conte
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure_internal(win_config.second);
+ // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, both will be turned off (false)
+ // in which case we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
+ // This means that the actual m used by the kernel is given by output->info()->dimension(1)
+ const unsigned int internal_m = _reinterpret_output_as_3d ? output->info()->dimension(1) * output->info()->dimension(2) : output->info()->dimension(1);
+ const unsigned int n = output->info()->dimension(0);
+
const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(1) : input0->info()->dimension(1);
const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(2) : input0->info()->dimension(2);
const unsigned int m0 = num_elements_processed.y();
const unsigned int n0 = num_elements_processed.x();
+ // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+ const unsigned int partial_store_m0 = internal_m % m0;
+ const unsigned int partial_store_n0 = n % n0;
+
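
The two moduli are the heart of the change: partial_store_m0 = internal_m % m0 and partial_store_n0 = n % n0 are exactly the leftover rows and columns that previously had to be covered by output padding. A quick check in C:

    #include <stdio.h>

    int main(void)
    {
        const unsigned internal_m = 37, n = 19; /* illustrative output shape      */
        const unsigned m0 = 4, n0 = 4;          /* block sizes per work-item      */
        printf("PARTIAL_STORE_M0=%u PARTIAL_STORE_N0=%u\n",
               internal_m % m0, n % n0);        /* 1 row and 3 columns left over  */
        return 0;
    }
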
// Create build options
CLBuildOptions build_opts;
@@ -384,9 +365,13 @@ void CLGEMMMatrixMultiplyKernel::configure(const CLCompileContext &compile_conte
const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+ build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
+ build_opts.add_option("-DN=" + support::cpp11::to_string(n));
build_opts.add_option("-DK=" + support::cpp11::to_string(input1->info()->dimension(0) / (n0 * mult_transpose1xW_width)));
build_opts.add_option("-DH0=" + support::cpp11::to_string(mult_transpose1xW_width));
build_opts.add_option("-DV0=" + support::cpp11::to_string(mult_interleave4x4_height));
+ build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+ build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
if(is_data_type_float(data_type) && is_bifrost)
{
@@ -404,8 +389,13 @@ void CLGEMMMatrixMultiplyKernel::configure(const CLCompileContext &compile_conte
}
else // The input tensors have not been reshaped
{
+ build_opts.add_option("-DN=" + support::cpp11::to_string(n));
build_opts.add_option("-DK=" + support::cpp11::to_string(input0->info()->dimension(0)));
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+ build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+ build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
// Create kernels according to the architecture, data type and input size.
if(is_data_type_float(data_type) && is_bifrost)
@@ -437,8 +427,6 @@ void CLGEMMMatrixMultiplyKernel::configure(const CLCompileContext &compile_conte
{
kernel_name = "gemm_mm_floating_point";
}
- build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
}
// Create kernel
@@ -463,6 +451,8 @@ void CLGEMMMatrixMultiplyKernel::configure(const CLCompileContext &compile_conte
_config_id += support::cpp11::to_string(output->info()->dimension(3));
_config_id += "_";
_config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
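
The get_padding_info / has_padding_changed pair brackets configure() to assert that nothing in between grew the padding of the listed tensors, which is precisely the invariant this patch establishes. A minimal C sketch of the snapshot-and-compare pattern (the real helpers operate on ITensorInfo; names here are stand-ins):

    #include <assert.h>

    /* Hypothetical scalar stand-in for the snapshot/compare pattern behind
     * get_padding_info() / has_padding_changed(). */
    typedef struct { int top, bottom, left, right; } padding_t;

    int main(void)
    {
        padding_t tensor_padding = { 0, 0, 0, 0 };
        padding_t snapshot = tensor_padding;       /* get_padding_info(...)       */
        /* ... kernel configuration runs here; it must not grow the padding ...  */
        assert(tensor_padding.top == snapshot.top &&
               tensor_padding.bottom == snapshot.bottom &&
               tensor_padding.left == snapshot.left &&
               tensor_padding.right == snapshot.right); /* has_padding_changed    */
        return 0;
    }
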
Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,