From 4bfc70e31766587c951204c93a127a486e007d0c Mon Sep 17 00:00:00 2001
From: Gunes Bayir
Date: Fri, 10 Dec 2021 16:17:56 +0000
Subject: Add Gemm MMUL Reshaped Only Rhs Support for FP32/FP16

This patch introduces a GEMM routine that is optimized for
Arm(R) Mali(TM)-G715 and Arm(R) Mali(TM)-G615

Resolves: COMPMID-5216

Signed-off-by: Gunes Bayir
Change-Id: I2e5d7806f5904347185bb3e250f73d73d6669dba
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7914
Reviewed-by: SiCong Li
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
Benchmark: Arm Jenkins
---
 src/core/CL/CLHelpers.cpp                          |   4 +
 .../common/gemm_reshaped_only_rhs_mmul.cl          | 528 +++++++++++++++++++++
 src/core/CL/cl_kernels/tile_helpers.h              |  55 ++-
 3 files changed, 577 insertions(+), 10 deletions(-)
 create mode 100644 src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl

diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 5172a7730a..94675d60cc 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -491,4 +491,8 @@ void set_unroll_with_pragma(CLBuildOptions &built_opts, std::initializer_list<std::string> values)
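The CLHelpers.cpp change gates the new kernels on the Arm matrix-multiply OpenCL extension. The following is a minimal sketch of such a capability check, not the verbatim hunk: the helper name arm_matrix_multiply_supported is an assumption, while device_supports_extension is the existing CLHelpers utility a check like this would build on.

    // Sketch only: gate MMUL kernel selection on the cl_arm_matrix_multiply extension.
    // The helper name below is an assumption; device_supports_extension() is the existing utility.
    #include "arm_compute/core/CL/CLHelpers.h"

    namespace arm_compute
    {
    bool arm_matrix_multiply_supported(const cl::Device &device)
    {
        // cl_arm_matrix_multiply exposes the arm_matrix_multiply() built-in used by the new kernels
        return device_supports_extension(device, "cl_arm_matrix_multiply");
    }
    } // namespace arm_compute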
diff --git a/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl b/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl
new file mode 100644
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl
@@ -0,0 +1,528 @@
+#if defined(GEMM_MM_RESHAPED_ONLY_RHS_MMUL)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices using the MMUL extension:
+ *
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel and the block K0xN0 is NOT transposed
+ *
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=1).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of output columns processed by the cooperative mmul extension must be passed at compile time using -DMMUL_N0 (e.g., -DMMUL_N0=2)
+ * @note The number of output rows processed by the cooperative mmul extension must be passed at compile time using -DMMUL_M0 (e.g., -DMMUL_M0=2)
+ * @note The number of lhs columns (or rhs rows) processed by the cooperative mmul extension must be passed at compile time using -DMMUL_K0 (e.g., -DMMUL_K0=2)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 > 0
+ *  - N0 = 1, 2, 3, 4, 8, 16
+ *  - K0 = 1
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
+ * The activation function is performed after the bias addition
+ *
+ * @param[in]  lhs_ptr                            Pointer to the LHS tensor. Supported data types: F16/F32
+ * @param[in]  lhs_stride_y                       Stride of the LHS tensor in Y dimension (in bytes)
+ * @param[in]  lhs_stride_z                       Stride of the LHS tensor in Z dimension (in bytes)
+ * @param[in]  lhs_w                              The size of the width dimension of the LHS tensor
+ * @param[in]  lhs_h                              The size of the height dimension of the LHS tensor
+ * @param[in]  lhs_n                              The size of the depth dimension of the LHS tensor
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS tensor
+ * @param[in]  rhs_ptr                            Pointer to the RHS reshaped tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_y                       Stride of the RHS tensor in Y dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the RHS tensor in Z dimension (in bytes)
+ * @param[in]  rhs_w                              The size of the width dimension of the RHS tensor
+ * @param[in]  rhs_h                              The size of the height dimension of the RHS tensor
+ * @param[in]  rhs_n                              The size of the depth dimension of the RHS tensor
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS tensor
+ * @param[in]  bia_ptr                            (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  bia_stride_y                       (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  bia_stride_z                       (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  bia_w                              (Optional) The size of the width dimension of the bias tensor
+ * @param[in]  bia_h                              (Optional) The size of the height dimension of the bias tensor
+ * @param[in]  bia_n                              (Optional) The size of the depth dimension of the bias tensor
+ * @param[in]  bia_offset_first_element_in_bytes  (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_w                              The size of the width dimension of the destination tensor
+ * @param[in]  dst_h                              The size of the height dimension of the destination tensor
+ * @param[in]  dst_n                              The size of the depth dimension of the destination tensor
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  M                                  Number of rows in LHS matrix not reshaped
+ * @param[in]  N                                  Number of columns in RHS matrix not reshaped
+ * @param[in]  K                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped
+ */
+__kernel void gemm_mm_reshaped_only_rhs_nt_mmul(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, BUFFER),
+#if defined(BETA)
+    TENSOR3D_T(bia, BUFFER),
+#endif // defined(BETA)
+    TENSOR3D_T(dst, BUFFER),
+    const int M,
+    const int N,
+    const int K)
+{
+#define MMUL_BLOCK_SIZE (MMUL_N0 * MMUL_K0)
+
+    uint x0 = get_global_id(0); // (N / N0) * MMUL_K0
+    uint y0 = get_global_id(1); // (M / M0) / MMUL_M0
+    uint z  = get_global_id(2); // Batch
+
+    // Get block ID and thread ID within the block
+    uint block_id  = (x0 / MMUL_BLOCK_SIZE);
+    uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+
+    // Coordinate within a block
+    uint block_x = thread_id % MMUL_N0;
+    uint block_y = (thread_id / MMUL_M0);
+
+    // Starting destination coordinates
+    uint dst_x = min(block_x * N0 + block_id * MMUL_N0 * N0, (uint)(N - 1));
+    uint dst_y = min(block_y * M0 + y0 * M0 * MMUL_M0, (uint)(M - M0));
+
+    // Note: We need to clamp dst_x and dst_y because we always need to execute a complete MMUL block! Only after the matrix multiplication
+    // part can we exit the kernel if it is out-of-bound. Remember, we have a cooperative matrix multiplication.
+    // Therefore, we need a full block to get the correct results
+
+    // Starting LHS coordinates
+    uint lhs_x = block_x;
+    uint lhs_y = dst_y;
+
+    // Starting RHS coordinates
+    uint rhs_x = block_y * N0 * MMUL_N0 + block_x * N0;
+    uint rhs_y = block_id;
+
+    // Compute LHS/RHS/DST matrix address
+    lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+    rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+    dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+    // Note: If RHS derives from the weights of a convolution 2d layer, RHS will always be 2D and rhs_stride_z will always be equal to 0,
+    // so the tensor does not slide along Z
+
+    // Initialize the accumulators
+    // MMUL extension accumulates the result in F32 for both F32 and F16
+    TILE(float, M0, N0, c_f32);
+
+#if !defined(HALF_PRECISION)
+#define c c_f32
+#endif // !defined(HALF_PRECISION)
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c_f32[i].v = 0;
+    })
+
+    for(int k = 0; k <= K - MMUL_K0; k += MMUL_K0)
+    {
+        TILE(DATA_TYPE, M0, 1, a);
+        TILE(DATA_TYPE, 1, N0, b);
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, 0, b);
+
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            LOOP_UNROLLING(int, n0, 0, 1, N0,
+            {
+                c_f32[m0].s[n0] = arm_matrix_multiply(a[m0].s[0], b[0].s[n0], c_f32[m0].s[n0]);
+            })
+        })
+
+        lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+        rhs_offset_first_element_in_bytes += MMUL_K0 * MMUL_N0 * N0 * sizeof(DATA_TYPE);
+    }
+
+    if(block_x * N0 + block_id * MMUL_N0 * N0 >= N)
+    {
+        return;
+    }
+
+    if(block_y * M0 + y0 * M0 * MMUL_M0 >= M)
+    {
+        return;
+    }
+
+#if defined(HALF_PRECISION)
+    TILE(DATA_TYPE, M0, N0, c);
+
+    // Conversion required for the half precision
+    LOOP_UNROLLING(int, m0, 0, 1, M0,
+    {
+        LOOP_UNROLLING(int, n0, 0, 1, N0,
+        {
+            c[m0].s[n0] = c_f32[m0].s[n0];
+        })
+    })
+#endif // defined(HALF_PRECISION)
+
+    // Multiply by the weight of matrix-matrix product
+#if defined(ALPHA)
+    T_SCALE_CONSTANT(DATA_TYPE, M0, N0, c, (DATA_TYPE)ALPHA, c);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    bia_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE);
+
+    TILE(DATA_TYPE, 1, N0, bias0);
+
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        bias0[0].v = VLOAD(N0)(0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
+    }
+    else
+    {
+        VLOAD_PARTIAL(N0, N0_LEFTOVER)
+        (bias0[0].v, 0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
+    }
+
+#ifndef UNIT_BETA
+    T_SCALE_CONSTANT(DATA_TYPE, 1, N0, bias0, (DATA_TYPE)BETA, bias0);
+#endif // UNIT_BETA
+
+    // c = c + bias[broadcasted]
+    T_ELTWISE_BROADCAST_X(V_ADD, DATA_TYPE, M0, N0, c, bias0, c);
+#else // defined(BROADCAST_BIAS)
+    TILE(DATA_TYPE, M0, N0, bias0);
+
+    bia_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * bia_stride_y + z * bia_stride_z;
+
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                bias0[m0].v = VLOAD(N0)(0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + m0 * bia_stride_y));
+            }
+        })
+    }
+    else
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VLOAD_PARTIAL(N0, N0_LEFTOVER)
+                (bias0[m0].v, 0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + m0 * bia_stride_y));
+            }
+        })
+    }
+
+#ifndef UNIT_BETA
+    T_SCALE_CONSTANT(DATA_TYPE, M0, N0, bias0, (DATA_TYPE)BETA, bias0);
+#endif // UNIT_BETA
+
+    // c = c + bias
+    T_ADD(DATA_TYPE, M0, N0, c, bias0, c);
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
+
+    // Store
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE(N0)
+                (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+    else
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE_PARTIAL(N0, N0_LEFTOVER)
+                (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+
+#undef MMUL_BLOCK_SIZE
+}
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_MMUL)
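Both kernels start from the same coordinate decomposition: work-items are grouped into cooperative MMUL blocks of MMUL_N0 x MMUL_K0 lanes, and each lane owns one M0 x N0 destination tile. The host-side C++ sketch below replays that arithmetic for a single work-item; the tile and block sizes are example assumptions, since the kernel only receives them through -D defines.

    #include <algorithm>
    #include <cstdio>

    // Illustrative re-implementation of the kernel's coordinate decomposition.
    // The sizes below are example assumptions, passed to the real kernel as -D defines.
    int main()
    {
        const unsigned M0 = 4, N0 = 4;
        const unsigned MMUL_M0 = 4, MMUL_N0 = 4, MMUL_K0 = 4;
        const unsigned MMUL_BLOCK_SIZE = MMUL_N0 * MMUL_K0; // lanes cooperating on one MMUL block

        const unsigned M = 64, N = 64;  // unreshaped GEMM shape (example)
        const unsigned x0 = 21, y0 = 1; // a sample (get_global_id(0), get_global_id(1)) pair

        const unsigned block_id  = x0 / MMUL_BLOCK_SIZE; // which MMUL block along N
        const unsigned thread_id = x0 % MMUL_BLOCK_SIZE; // lane within the block

        const unsigned block_x = thread_id % MMUL_N0;
        const unsigned block_y = thread_id / MMUL_M0;

        // Clamped so that every lane of the block still executes the cooperative multiply
        const unsigned dst_x = std::min(block_x * N0 + block_id * MMUL_N0 * N0, N - 1);
        const unsigned dst_y = std::min(block_y * M0 + y0 * M0 * MMUL_M0, M - M0);

        std::printf("lane %u of block %u writes the %ux%u tile at (x=%u, y=%u)\n",
                    thread_id, block_id, M0, N0, dst_x, dst_y);
        return 0;
    }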
+
+#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_MMUL_TEXTURE)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices using the MMUL extension and the OpenCL image for RHS:
+ *
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel and the block K0xN0 is NOT transposed
+ *
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=1).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of output columns processed by the cooperative mmul extension must be passed at compile time using -DMMUL_N0 (e.g., -DMMUL_N0=2)
+ * @note The number of output rows processed by the cooperative mmul extension must be passed at compile time using -DMMUL_M0 (e.g., -DMMUL_M0=2)
+ * @note The number of lhs columns (or rhs rows) processed by the cooperative mmul extension must be passed at compile time using -DMMUL_K0 (e.g., -DMMUL_K0=2)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 > 0
+ *  - N0 = 1, 2, 3, 4, 8, 16
+ *  - K0 = 1
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
+ * The activation function is performed after the bias addition
+ *
+ * @param[in]  lhs_ptr                            Pointer to the LHS tensor. Supported data types: F16/F32
+ * @param[in]  lhs_stride_y                       Stride of the LHS tensor in Y dimension (in bytes)
+ * @param[in]  lhs_stride_z                       Stride of the LHS tensor in Z dimension (in bytes)
+ * @param[in]  lhs_w                              The size of the width dimension of the LHS tensor
+ * @param[in]  lhs_h                              The size of the height dimension of the LHS tensor
+ * @param[in]  lhs_n                              The size of the depth dimension of the LHS tensor
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS tensor
+ * @param[in]  rhs_ptr                            Pointer to the RHS reshaped tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_y                       Stride of the RHS tensor in Y dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the RHS tensor in Z dimension (in bytes)
+ * @param[in]  rhs_w                              The size of the width dimension of the RHS tensor
+ * @param[in]  rhs_h                              The size of the height dimension of the RHS tensor
+ * @param[in]  rhs_n                              The size of the depth dimension of the RHS tensor
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS tensor
+ * @param[in]  bia_ptr                            (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  bia_stride_y                       (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  bia_stride_z                       (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  bia_w                              (Optional) The size of the width dimension of the bias tensor
+ * @param[in]  bia_h                              (Optional) The size of the height dimension of the bias tensor
+ * @param[in]  bia_n                              (Optional) The size of the depth dimension of the bias tensor
+ * @param[in]  bia_offset_first_element_in_bytes  (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_w                              The size of the width dimension of the destination tensor
+ * @param[in]  dst_h                              The size of the height dimension of the destination tensor
+ * @param[in]  dst_n                              The size of the depth dimension of the destination tensor
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  M                                  Number of rows in LHS matrix not reshaped
+ * @param[in]  N                                  Number of columns in RHS matrix not reshaped
+ * @param[in]  K                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped
+ */
+__kernel void gemm_mm_reshaped_only_rhs_nt_mmul_texture(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, IMAGE),
+#if defined(BETA)
+    TENSOR3D_T(bia, BUFFER),
+#endif // defined(BETA)
+    TENSOR3D_T(dst, BUFFER),
+    const int M,
+    const int N,
+    const int K)
+{
+#define MMUL_BLOCK_SIZE (MMUL_N0 * MMUL_K0)
+
+    uint x0 = get_global_id(0); // (N / N0) * MMUL_K0
+    uint y0 = get_global_id(1); // (M / M0) / MMUL_M0
+    uint z  = get_global_id(2); // Batch
+
+    // Get block ID and thread ID within the block
+    uint block_id  = (x0 / MMUL_BLOCK_SIZE);
+    uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+
+    // Coordinate within a block
+    uint block_x = thread_id % MMUL_N0;
+    uint block_y = (thread_id / MMUL_M0);
+
+    // Starting destination coordinates
+    uint dst_x = min(block_x * N0 + block_id * MMUL_N0 * N0, (uint)(N - 1));
+    uint dst_y = min(block_y * M0 + y0 * M0 * MMUL_M0, (uint)(M - M0));
+
+    // Note: We need to clamp dst_x and dst_y because we always need to execute a complete MMUL block! Only after the matrix multiplication
+    // part can we exit the kernel if it is out-of-bound. Remember, we have a cooperative matrix multiplication.
+    // Therefore, we need a full block to get the correct results
+
+    // Starting LHS coordinates
+    uint lhs_x = block_x;
+    uint lhs_y = dst_y;
+
+    // Starting RHS coordinates
+    uint rhs_x = block_y * N0 * MMUL_N0 + block_x * N0;
+    uint rhs_y = block_id + z * rhs_h;
+
+    // Compute LHS/RHS/DST matrix address
+    lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+    dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+    // Initialize the accumulators
+    // MMUL extension accumulates the result in F32 for both F32 and F16
+    TILE(float, M0, N0, c_f32);
+
+#if !defined(HALF_PRECISION)
+#define c c_f32
+#endif // !defined(HALF_PRECISION)
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c_f32[i].v = 0;
+    })
+
+    for(int k = 0; k <= K - MMUL_K0; k += MMUL_K0)
+    {
+        TILE(DATA_TYPE, M0, 1, a);
+        TILE(DATA_TYPE, 1, N0, b);
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, 1, N0, IMAGE, rhs, rhs_x, rhs_y, 1, rhs_stride_y, b);
+
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            LOOP_UNROLLING(int, n0, 0, 1, N0,
+            {
+                c_f32[m0].s[n0] = arm_matrix_multiply(a[m0].s[0], b[0].s[n0], c_f32[m0].s[n0]);
+            })
+        })
+
+        lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+        rhs_x += MMUL_K0 * MMUL_N0 * N0;
+    }
+
+    if(block_x * N0 + block_id * MMUL_N0 * N0 >= N)
+    {
+        return;
+    }
+
+    if(block_y * M0 + y0 * M0 * MMUL_M0 >= M)
+    {
+        return;
+    }
+
+#if defined(HALF_PRECISION)
+    TILE(DATA_TYPE, M0, N0, c);
+
+    // Conversion required for the half precision
+    LOOP_UNROLLING(int, m0, 0, 1, M0,
+    {
+        LOOP_UNROLLING(int, n0, 0, 1, N0,
+        {
+            c[m0].s[n0] = c_f32[m0].s[n0];
+        })
+    })
+#endif // defined(HALF_PRECISION)
+
+    // Multiply by the weight of matrix-matrix product
+#if defined(ALPHA)
+    T_SCALE_CONSTANT(DATA_TYPE, M0, N0, c, (DATA_TYPE)ALPHA, c);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    bia_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE);
+
+    TILE(DATA_TYPE, 1, N0, bias0);
+
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        bias0[0].v = VLOAD(N0)(0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
+    }
+    else
+    {
+        VLOAD_PARTIAL(N0, N0_LEFTOVER)
+        (bias0[0].v, 0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
+    }
+
+#ifndef UNIT_BETA
+    T_SCALE_CONSTANT(DATA_TYPE, 1, N0, bias0, (DATA_TYPE)BETA, bias0);
+#endif // UNIT_BETA
+
+    // c = c + bias[broadcasted]
+    T_ELTWISE_BROADCAST_X(V_ADD, DATA_TYPE, M0, N0, c, bias0, c);
+#else // defined(BROADCAST_BIAS)
+    TILE(DATA_TYPE, M0, N0, bias0);
+
+    bia_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * bia_stride_y + z * bia_stride_z;
+
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                bias0[m0].v = VLOAD(N0)(0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + m0 * bia_stride_y));
+            }
+        })
+    }
+    else
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VLOAD_PARTIAL(N0, N0_LEFTOVER)
+                (bias0[m0].v, 0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + m0 * bia_stride_y));
+            }
+        })
+    }
+
+#ifndef UNIT_BETA
+    T_SCALE_CONSTANT(DATA_TYPE, M0, N0, bias0, (DATA_TYPE)BETA, bias0);
+#endif // UNIT_BETA
+
+    // c = c + bias
+    T_ADD(DATA_TYPE, M0, N0, c, bias0, c);
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
+
+    // Store
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE(N0)
+                (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+    else
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE_PARTIAL(N0, N0_LEFTOVER)
+                (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+
+#undef MMUL_BLOCK_SIZE
+}
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_MMUL_TEXTURE)
\ No newline at end of file
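The kernel documentation above lists the compile-time options these kernels expect. A host-side sketch of assembling such a set of options is shown below; the concrete values, and the choice of defines such as DATA_TYPE and the GEMM_MM_RESHAPED_ONLY_RHS_MMUL selector, are illustrative assumptions rather than the options emitted by the library's kernel configuration code.

    #include <cstdio>
    #include <string>

    // Illustrative only: tile/block sizes and the data type are example assumptions,
    // not the heuristics the library uses when it configures these kernels.
    int main()
    {
        std::string build_opts;
        build_opts += "-DDATA_TYPE=float ";
        build_opts += "-DM0=4 -DN0=4 -DK0=1 ";                // tile sizes (K0 must be 1)
        build_opts += "-DMMUL_M0=4 -DMMUL_N0=4 -DMMUL_K0=4 "; // cooperative MMUL block shape
        build_opts += "-DM0_LEFTOVER=0 -DN0_LEFTOVER=0 ";     // boundary handling
        build_opts += "-DACTIVATION_TYPE=RELU -DA_VAL=0.0f -DB_VAL=0.0f ";
        build_opts += "-DGEMM_MM_RESHAPED_ONLY_RHS_MMUL";     // select the buffer (non-texture) variant
        std::printf("%s\n", build_opts.c_str());
        return 0;
    }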
diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h
index 0ce343e3ec..4b6144a22d 100644
--- a/src/core/CL/cl_kernels/tile_helpers.h
+++ b/src/core/CL/cl_kernels/tile_helpers.h
@@ -970,8 +970,8 @@
 #define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
 #define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
 
-#define T_ADD(A_VAL, B_VAL) ((A_VAL) + (B_VAL))
-#define T_DIV(A_VAL, B_VAL) ((A_VAL) / (B_VAL))
+#define V_ADD(A_VAL, B_VAL) ((A_VAL) + (B_VAL))
+#define V_DIV(A_VAL, B_VAL) ((A_VAL) / (B_VAL))
 
 /** Element-wise activation for quantized types
  *
@@ -995,6 +995,25 @@
         })                                                   \
     })
 
+/** Element-wise addition between two tiles
+ *
+ * @note Performs: LHS + RHS = DST
+ *
+ * @param[in]  DATA_TYPE LHS/RHS/DST data type
+ * @param[in]  M0        Number of LHS rows
+ * @param[in]  N0        Number of LHS columns
+ * @param[in]  lhs       LHS tile
+ * @param[in]  rhs       RHS tile
+ * @param[out] dst       DST tile
+ */
+#define T_ADD(DATA_TYPE, M0, N0, lhs, rhs, dst)   \
+    ({                                            \
+        LOOP_UNROLLING(int, _m0, 0, 1, M0,        \
+        {                                         \
+            dst[_m0].v = lhs[_m0].v + rhs[_m0].v; \
+        })                                        \
+    })
+
 /** Element-wise addition with a constant value
  *
  * @note Performs: LHS + constant = DST
@@ -1010,15 +1029,31 @@
     ({                                           \
         LOOP_UNROLLING(int, _m0, 0, 1, M0,       \
         {                                        \
-            LOOP_UNROLLING(int, _n0, 0, 1, N0,   \
-            {                                    \
-                dst[_m0].s[_n0] = lhs[_m0].s[_n0] + rhs_constant; \
-            })                                   \
+            dst[_m0].v = lhs[_m0].v + (DATA_TYPE)rhs_constant; \
         })                                       \
     })
 
-#define T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(T_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
-#define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(T_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+
+/** Element-wise scale with a constant value
+ *
+ * @note Performs: LHS * constant = DST
+ *
+ * @param[in]  DATA_TYPE    LHS/RHS/DST data type
+ * @param[in]  M0           Number of LHS rows
+ * @param[in]  N0           Number of LHS columns
+ * @param[in]  lhs          LHS tile
+ * @param[in]  rhs_constant Constant value
+ * @param[out] dst          DST tile
+ */
+#define T_SCALE_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
+    ({                                                              \
+        LOOP_UNROLLING(int, _m0, 0, 1, M0,                          \
+        {                                                           \
+            dst[_m0].v = lhs[_m0].v * (DATA_TYPE)rhs_constant;      \
+        })                                                          \
+    })
 
 /** Element-wise operation with RHS broadcasted (RHS has the X dimension only)
  *
@@ -1041,8 +1076,8 @@
         })                                                                                            \
     })
 
-#define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(T_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
-#define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(T_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
 
 /** Element-wise operation between two tiles (LHS and RHS)
  *
--
cgit v1.2.1
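The tile_helpers.h additions (T_ADD and T_SCALE_CONSTANT, plus the V_ADD/V_DIV renaming of the scalar helpers) implement the ALPHA/BETA epilogue used by the kernels above. As a reference for what that epilogue computes, here is a scalar C++ sketch; it mirrors the documented semantics (dst = activation(alpha * acc + beta * bias)) and is not library code.

    #include <cstddef>
    #include <vector>

    // Scalar reference for the epilogue applied after the MMUL accumulation:
    // dst = activation(alpha * acc + beta * bias).
    // 'broadcast_bias' mirrors the BROADCAST_BIAS switch (one bias value per output column).
    void gemm_epilogue_reference(std::vector<float> &acc, const std::vector<float> &bias,
                                 std::size_t M, std::size_t N,
                                 float alpha, float beta, bool broadcast_bias)
    {
        for(std::size_t y = 0; y < M; ++y)
        {
            for(std::size_t x = 0; x < N; ++x)
            {
                float v = alpha * acc[y * N + x];                         // -DALPHA scaling (T_SCALE_CONSTANT)
                v += beta * (broadcast_bias ? bias[x] : bias[y * N + x]); // -DBETA * bias (T_ADD / broadcast add)
                acc[y * N + x] = v > 0.0f ? v : 0.0f;                     // e.g. -DACTIVATION_TYPE=RELU
            }
        }
    }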