aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/cl_kernels
diff options
context:
space:
mode:
authorMichele Di Giorgio <michele.digiorgio@arm.com>2020-04-03 12:40:10 +0100
committerMichele Di Giorgio <michele.digiorgio@arm.com>2020-04-08 08:43:39 +0000
commitf64d33619827ce6ec9af4566c4743834e521328e (patch)
tree2ca123ae92c2f91ecbccbacc77687bc359973fd4 /src/core/CL/cl_kernels
parent8abbabd6ad946441c8ef1a03896fa98f7801af1f (diff)
downloadComputeLibrary-f64d33619827ce6ec9af4566c4743834e521328e.tar.gz
COMPMID-3236: Extend CLGEMMLowpReduction kernels to multiply by a scalar value
Change-Id: Iebd6afac65d10a42d60c2c9df9e1895fadb205ae Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/2981 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Sang-Hoon Park <sang-hoon.park@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels')
-rw-r--r--src/core/CL/cl_kernels/gemmlowp.cl26
1 files changed, 21 insertions, 5 deletions
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index 71de1d4b27..b707ec8175 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -1287,6 +1287,7 @@ __kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs),
#if defined(COLS_A)
/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
+ * It is also possible to multiply each reduced row by a scalar value, if SCALAR is passed at compile time.
*
* @note This stage is needed to handle the offset of matrix product
* https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
@@ -1294,8 +1295,9 @@ __kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs),
* @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
* @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
* @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
+ * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g. -DSCALAR=3)
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -1342,11 +1344,15 @@ __kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
sum_row += sum_row_32.s0 + sum_row_32.s1 + sum_row_32.s2 + sum_row_32.s3;
+#if defined(SCALAR)
+ sum_row *= (int)SCALAR;
+#endif // defined(SCALAR)
*((__global int *)dst.ptr) = (int)sum_row;
}
#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A using the arm dot product instruction
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A using the arm dot product instruction.
+ * It is also possible to multiply each reduced row by a scalar value, if SCALAR is passed at compile time.
*
* @note This stage is needed to handle the offset of matrix product
* https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
@@ -1354,8 +1360,9 @@ __kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
* @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
* @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
* @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
+ * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g. -DSCALAR=3)
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -1408,6 +1415,9 @@ __kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src),
sum_row += (ACC_DATA_TYPE)matrix_a[i];
}
+#if defined(SCALAR)
+ sum_row *= (int)SCALAR;
+#endif // defined(SCALAR)
*((__global int *)dst.ptr) = (int)sum_row;
}
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
@@ -1415,6 +1425,7 @@ __kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src),
#if defined(COLS_B) && defined(ROWS_B)
/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
+ * It is also possible to multiply each reduced column by a scalar value, if SCALAR is passed at compile time.
*
* @note This stage is needed to handle the offset of matrix product
* https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
@@ -1422,8 +1433,9 @@ __kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src),
* @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B
* @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
* @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
+ * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (i.e. -DSCALAR=3)
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -1480,7 +1492,11 @@ __kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),
matrix_b += src_stride_y;
}
- vstore16(convert_int16(sum_col_32), 0, (__global int *)dst.ptr);
+#if defined(SCALAR)
+ sum_col_32 *= (VEC_DATA_TYPE(ACC_DATA_TYPE, 16))SCALAR;
+#endif // defined(SCALAR)
+ VSTORE(16)
+ (sum_col_32, 0, (__global int *)dst.ptr);
}
#endif // defined(COLS_B) && defined(ROWS_B)