aboutsummaryrefslogtreecommitdiff
path: root/src/core
diff options
context:
space:
mode:
authorMichele Di Giorgio <michele.digiorgio@arm.com>2018-11-16 16:04:25 +0000
committerMichele Di Giorgio <michele.digiorgio@arm.com>2019-02-07 09:44:08 +0000
commitebc3a90721fe4a41b8e141466894d4d7185c01b7 (patch)
tree9149764caa37edbdc6bb6c69d503d37dbb28449f /src/core
parent4632e5e44e9a78b15884d0947007bb030fde0aea (diff)
downloadComputeLibrary-ebc3a90721fe4a41b8e141466894d4d7185c01b7.tar.gz
COMPMID-1706: Fuse the bias addition within CLGEMM
Change-Id: I378f2023f4fa010f195f76716ac07aa86279bfae Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com> Reviewed-on: https://review.mlplatform.org/280 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Diffstat (limited to 'src/core')
-rw-r--r--src/core/CL/cl_kernels/gemm.cl296
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp11
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp73
3 files changed, 358 insertions, 22 deletions
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index 3359a42d0d..4736f80d9b 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -1784,6 +1784,8 @@ __kernel void gemm_interleave4x4(TENSOR3D_DECLARATION(src),
/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
*
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
+ *
* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
@@ -1796,6 +1798,8 @@ __kernel void gemm_interleave4x4(TENSOR3D_DECLARATION(src),
* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
*
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1808,6 +1812,10 @@ __kernel void gemm_interleave4x4(TENSOR3D_DECLARATION(src),
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1821,6 +1829,9 @@ __kernel void gemm_interleave4x4(TENSOR3D_DECLARATION(src),
*/
__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+ VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
IMAGE_DECLARATION(dst),
uint src0_stride_z,
uint src1_stride_z,
@@ -1910,6 +1921,16 @@ __kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
c30 = c30 * (float4)ALPHA;
#endif // defined(ALPHA)
+#if defined(ADD_VEC_C)
+ __global float *src2_addr = (__global float *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+ float4 c0 = vload4(0, src2_addr);
+
+ c00 += c0;
+ c10 += c0;
+ c20 += c0;
+ c30 += c0;
+#endif /* defined(ADD_VEC_C) */
+
// Compute dst address
__global uchar *dst_addr = offset(&dst, 0, 0);
@@ -1959,7 +1980,9 @@ __kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
}
/** This OpenCL kernel is optimized for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication.
+ *
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
*
* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
@@ -1974,6 +1997,8 @@ __kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
*
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1986,6 +2011,10 @@ __kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1999,6 +2028,9 @@ __kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
*/
__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+ VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
IMAGE_DECLARATION(dst),
uint src0_stride_z,
uint src1_stride_z,
@@ -2223,6 +2255,28 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
// Compute dst address
__global uchar *dst_addr = offset(&dst, 0, 0);
+#if defined(ADD_VEC_C)
+ __global float *src2_addr = (__global float *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+ float4 c0 = vload4(0, src2_addr);
+
+ c00 += c0.s0;
+ c01 += c0.s1;
+ c02 += c0.s2;
+ c03 += c0.s3;
+ c10 += c0.s0;
+ c11 += c0.s1;
+ c12 += c0.s2;
+ c13 += c0.s3;
+ c20 += c0.s0;
+ c21 += c0.s1;
+ c22 += c0.s2;
+ c23 += c0.s3;
+ c30 += c0.s0;
+ c31 += c0.s1;
+ c32 += c0.s2;
+ c33 += c0.s3;
+#endif /* defined(ADD_VEC_C) */
+
#if defined(REINTERPRET_OUTPUT_AS_3D)
// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
// in order to take into account the presence of possible cross plane paddings
@@ -2275,6 +2329,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
*
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
+ *
* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
@@ -2287,6 +2343,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
*
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -2299,6 +2357,10 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -2312,6 +2374,9 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
*/
__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+ VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
IMAGE_DECLARATION(dst),
uint src0_stride_z,
uint src1_stride_z,
@@ -2401,6 +2466,20 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
c30 = c30 * (half8)ALPHA;
#endif // defined(ALPHA)
+#if defined(ADD_VEC_C)
+ // *INDENT-OFF*
+ // clang-format off
+ __global half *src2_addr = (__global half *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+ half8 c0 = vload8(0, src2_addr);
+ // clang-format on
+ // *INDENT-ON*
+
+ c00 += c0;
+ c10 += c0;
+ c20 += c0;
+ c30 += c0;
+#endif /* defined(ADD_VEC_C) */
+
// Compute dst address
__global uchar *dst_addr = offset(&dst, 0, 0);
@@ -2452,6 +2531,8 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) while accumulating the result in a 32 floating point variable.
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
*
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
+ *
* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
@@ -2464,6 +2545,8 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
*
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -2476,6 +2559,10 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -2489,6 +2576,9 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
*/
__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+ VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
IMAGE_DECLARATION(dst),
uint src0_stride_z,
uint src1_stride_z,
@@ -2578,6 +2668,20 @@ __kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),
c30 = c30 * (float8)ALPHA;
#endif // defined(ALPHA)
+#if defined(ADD_VEC_C)
+ // *INDENT-OFF*
+ // clang-format off
+ __global half *src2_addr = (__global half *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+ float8 c0 = convert_float8(vload8(0, src2_addr));
+ // clang-format on
+ // *INDENT-ON*
+
+ c00 += c0;
+ c10 += c0;
+ c20 += c0;
+ c30 += c0;
+#endif /* defined(ADD_VEC_C) */
+
// Compute dst address
__global uchar *dst_addr = offset(&dst, 0, 0);
@@ -2629,6 +2733,8 @@ __kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),
/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
*
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
+ *
* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
@@ -2641,6 +2747,8 @@ __kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),
* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
*
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -2653,6 +2761,10 @@ __kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -2663,6 +2775,9 @@ __kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),
*/
__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+ VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
IMAGE_DECLARATION(dst),
uint src0_stride_z,
uint src1_stride_z,
@@ -2834,6 +2949,20 @@ __kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0)
c30 = c30 * (half8)ALPHA;
#endif // defined(ALPHA)
+#if defined(ADD_VEC_C)
+ // *INDENT-OFF*
+ // clang-format off
+ __global half *src2_addr = (__global half *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+ half8 c0 = vload8(0, src2_addr);
+ // clang-format on
+ // *INDENT-ON*
+
+ c00 += c0;
+ c10 += c0;
+ c20 += c0;
+ c30 += c0;
+#endif /* defined(ADD_VEC_C) */
+
// Compute dst address
__global uchar *dst_addr = offset(&dst, 0, 0);
@@ -2892,7 +3021,9 @@ __kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0)
#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)
#if defined(DATA_TYPE)
#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)
-/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped.
+ *
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
*
* @note This OpenCL kernel works with floating point data types (F16/F32)
* @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
@@ -2908,6 +3039,8 @@ __kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0)
* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
*
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -2920,6 +3053,10 @@ __kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0)
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
@@ -2934,6 +3071,9 @@ __kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0)
*/
__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+ VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
IMAGE_DECLARATION(dst),
uint src0_stride_z,
uint src1_stride_z,
@@ -3134,6 +3274,26 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
acc3 = acc3 * (VECTOR_TYPE)ALPHA;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 && defined(ALPHA)
+#if defined(ADD_VEC_C)
+ // *INDENT-OFF*
+ // clang-format off
+ __global DATA_TYPE *src2_addr = (__global DATA_TYPE *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+ VECTOR_TYPE c0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src2_addr);
+ // clang-format on
+ // *INDENT-ON*
+
+ acc0 += c0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif /* defined(ADD_VEC_C) */
+
int z = get_global_id(2);
#if defined(REINTERPRET_OUTPUT_AS_3D)
@@ -3204,6 +3364,8 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
*
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
+ *
* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
@@ -3219,6 +3381,8 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
*
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -3231,6 +3395,10 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
@@ -3245,6 +3413,9 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
*/
__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+ VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
IMAGE_DECLARATION(dst),
uint src0_stride_z,
uint src1_stride_z,
@@ -3599,6 +3770,34 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
// Compute dst address
__global uchar *dst_addr = offset(&dst, 0, 0);
+#if defined(ADD_VEC_C)
+ __global float *src2_addr = (__global float *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+ float4 c0 = vload4(0, src2_addr);
+
+ acc00 += c0.s0;
+ acc01 += c0.s1;
+ acc02 += c0.s2;
+ acc03 += c0.s3;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc10 += c0.s0;
+ acc11 += c0.s1;
+ acc12 += c0.s2;
+ acc13 += c0.s3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc20 += c0.s0;
+ acc21 += c0.s1;
+ acc22 += c0.s2;
+ acc23 += c0.s3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc30 += c0.s0;
+ acc31 += c0.s1;
+ acc32 += c0.s2;
+ acc33 += c0.s3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif /* defined(ADD_VEC_C) */
+
#if defined(REINTERPRET_OUTPUT_AS_3D)
// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
// in order to take into account the presence of possible cross plane paddings
@@ -3658,6 +3857,8 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
*
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
+ *
* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
* This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less or equal to 1000.
* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
@@ -3674,6 +3875,8 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
*
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -3686,6 +3889,10 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
@@ -3700,6 +3907,9 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
*/
__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+ VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
IMAGE_DECLARATION(dst),
uint src0_stride_z,
uint src1_stride_z,
@@ -3986,6 +4196,26 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
// Compute dst address
__global uchar *dst_addr = offset(&dst, 0, 0);
+#if defined(ADD_VEC_C)
+ __global float *src2_addr = (__global float *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+ float2 c0 = vload2(0, src2_addr);
+
+ acc00 += c0.s0;
+ acc01 += c0.s1;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc10 += c0.s0;
+ acc11 += c0.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc20 += c0.s0;
+ acc21 += c0.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc30 += c0.s0;
+ acc31 += c0.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif /* defined(ADD_VEC_C) */
+
#if defined(REINTERPRET_OUTPUT_AS_3D)
// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
// in order to take into account the presence of possible cross plane paddings
@@ -4046,6 +4276,8 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
*
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
+ *
* @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulating the result in a 32 floating point variable.
* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
@@ -4061,6 +4293,8 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
*
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -4073,6 +4307,10 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
@@ -4087,6 +4325,9 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
*/
__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+ VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
IMAGE_DECLARATION(dst),
uint src0_stride_z,
uint src1_stride_z,
@@ -4327,6 +4568,26 @@ __kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
#endif // defined(ALPHA)
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if defined(ADD_VEC_C)
+ // *INDENT-OFF*
+ // clang-format off
+ __global half *src2_addr = (__global half *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+ half8 c0 = vload8(0, src2_addr);
+ // clang-format on
+ // *INDENT-ON*
+
+ hacc0 += c0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ hacc1 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ hacc2 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ hacc3 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif /* defined(ADD_VEC_C) */
+
int z = get_global_id(2);
// Compute destination address
@@ -4394,6 +4655,8 @@ __kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
*
+ * Moreover, it can add a vector (src2) if the ADD_VEC_C parameter is passed at compile time.
+ *
* @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.
* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
@@ -4409,6 +4672,8 @@ __kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
*
+ * @note In case a 3rd input (src2) needs to be added, the ADD_VEC_C parameter has to be passed at compile time as -DADD_VEC_C
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -4421,6 +4686,10 @@ __kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the source vector in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
@@ -4435,6 +4704,9 @@ __kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
*/
__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
+#if defined(ADD_VEC_C)
+ VECTOR_DECLARATION(src2),
+#endif /* defined(ADD_VEC_C) */
IMAGE_DECLARATION(dst),
uint src0_stride_z,
uint src1_stride_z,
@@ -4659,6 +4931,26 @@ __kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
acc3 = acc3 * (half8)ALPHA;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 && defined(ALPHA)
+#if defined(ADD_VEC_C)
+ // *INDENT-OFF*
+ // clang-format off
+ __global half *src2_addr = (__global half *)(src2_ptr + src2_offset_first_element_in_bytes + get_global_id(0) * src2_step_x);
+ half8 c0 = vload8(0, src2_addr);
+ // clang-format on
+ // *INDENT-ON*
+
+ acc0 += c0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += c0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif /* defined(ADD_VEC_C) */
+
int z = get_global_id(2);
// Compute destination address
diff --git a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
index 825d7fb216..803ed30d84 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -86,14 +86,13 @@ void CLGEMMMatrixAdditionKernel::configure(const ICLTensor *input, ICLTensor *ou
_input = input;
_output = output;
- std::ostringstream ma_arguments;
- ma_arguments << "-DBETA=" << beta;
- std::set<std::string> build_opts;
- build_opts.emplace(ma_arguments.str());
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DBETA=" + float_to_string_with_full_precision(beta));
// Create kernel
std::string data_type_name = lower_string(string_from_data_type(input->info()->data_type()));
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_ma_" + data_type_name), build_opts));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_ma_" + data_type_name), build_opts.options()));
// Configure kernel window
auto win_config = validate_and_configure_window(input->info(), output->info());
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index b667621426..2b004c23db 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -48,8 +48,8 @@ namespace
{
using ElementsProcessed = Steps;
-inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info,
- bool fp_mixed_precision)
+inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float beta,
+ bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
@@ -61,9 +61,20 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i
ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 2 && reshape_info.reinterpret_input_as_3d(), "The input1 tensor cannot have more than 2 dimensions if input0 has to be reinterpreted as 3D");
+ const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
+ const bool has_vec_c = input2 != nullptr && beta != 0.f;
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(has_vec_c && !is_beta_one, "Adding input2 is only supported for beta equal to 1");
+
if(!is_interleaved_transposed)
{
ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
+
+ if(has_vec_c)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input2);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input2->num_dimensions() > 1, "input2 must be a 1D tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input2->dimension(0) != input1->dimension(0), "Length of Vector C must match the number of columns of matrix B");
+ }
}
else
{
@@ -101,6 +112,12 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
+
+ if(has_vec_c)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input2);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input2->num_dimensions() > 1, "input2 must be a 1D tensor");
+ }
}
if(output->total_size() != 0)
@@ -113,10 +130,11 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i
return Status{};
}
-inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output,
- bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target,
+inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output,
+ float beta, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target,
ElementsProcessed &num_elements_processed)
{
+ ARM_COMPUTE_UNUSED(beta);
bool window_changed = false;
Window win{};
Window win_out{};
@@ -126,6 +144,7 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
bool reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d();
bool reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
+ const bool has_vec_c = input2 != nullptr && beta != 0.f;
// In case both input and output have to be reinterpreted as 3D tensors,
// force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
@@ -176,6 +195,11 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
+ if(has_vec_c)
+ {
+ AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration_x);
+ window_changed = window_changed || update_window_and_padding(win, input2_access);
+ }
output_access.set_valid_region(win_out, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
}
@@ -209,6 +233,11 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
+ if(has_vec_c)
+ {
+ AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration_x);
+ window_changed = window_changed || update_window_and_padding(win, input2_access);
+ }
Coordinates coord;
coord.set_num_dimensions(output->num_dimensions());
@@ -227,20 +256,22 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
} // namespace
CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false)
+ : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _has_vec_c(false)
{
}
-void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info,
- bool fp_mixed_precision)
+void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
+ bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
// Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info, fp_mixed_precision));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), (input2 != nullptr) ? input2->info() : nullptr, output->info(), beta,
+ is_interleaved_transposed, reshape_info, fp_mixed_precision));
_input0 = input0;
_input1 = input1;
+ _input2 = input2;
_output = output;
_reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d();
_reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
@@ -266,7 +297,8 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
ElementsProcessed num_elements_processed{};
// Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info, gpu_target, num_elements_processed);
+ auto win_config = validate_and_configure_window(input0->info(), input1->info(), (input2 != nullptr) ? input2->info() : nullptr, output->info(), beta, is_interleaved_transposed, reshape_info,
+ gpu_target, num_elements_processed);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure_internal(win_config.second);
@@ -288,6 +320,8 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST;
+ _has_vec_c = input2 != nullptr && beta != 0.f;
+
std::string kernel_name;
if(is_interleaved_transposed)
{
@@ -351,6 +385,9 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elements_processed.x()));
}
+ // Configure matrix C addition if necessary
+ build_opts.add_option_if(_has_vec_c, "-DADD_VEC_C");
+
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
@@ -373,16 +410,18 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
_config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
}
-Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed,
- const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision)
+Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,
+ bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision)
{
// Note: num_elements_processed will be set in validate_and_configure_window()
ElementsProcessed num_elements_processed{};
ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed, reshape_info, fp_mixed_precision));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, input2, output, beta, is_interleaved_transposed, reshape_info, fp_mixed_precision));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
input1->clone().get(),
+ (input2 != nullptr) ? input2->clone().get() : nullptr,
output->clone().get(),
+ beta,
is_interleaved_transposed,
reshape_info,
gpu_target,
@@ -409,10 +448,12 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que
slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+ const unsigned int num_arguments_vec_c = (_has_vec_c) ? num_arguments_per_1D_tensor() : 0;
+
if(_reinterpret_input_as_3d)
{
// Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
+ const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + num_arguments_vec_c;
const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
}
@@ -420,7 +461,7 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que
if(_reinterpret_output_as_3d)
{
// Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
+ const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0) + num_arguments_vec_c;
const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
}
@@ -438,6 +479,10 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que
unsigned int idx = 0;
add_2D_tensor_argument(idx, _input0, slice);
add_2D_tensor_argument(idx, _input1, slice_b);
+ if(_has_vec_c)
+ {
+ add_1D_tensor_argument(idx, _input2, slice);
+ }
add_2D_tensor_argument(idx, _output, slice);
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));