From 10e88a73518932abc427e6b12d0267d5f52ac569 Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice
Date: Mon, 29 Nov 2021 12:49:19 +0000
Subject: Rework gemm_mm_reshaped_only_rhs_ kernels with new macros

- Rework gemm_reshaped_rhs_only with new TILE macros
- Fuse post ops in gemm_reshaped_rhs_only

Resolves COMPMID-4890

Change-Id: I944948ecec6d08deaf3545b80cd3eeac26e44205
Signed-off-by: Gian Marco Iodice
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6944
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
Reviewed-by: Sheri Zhang
---
 Android.bp                                         |    2 +-
 SConscript                                         |    2 +-
 src/core/CL/CLUtils.cpp                            |    3 +-
 .../gemm_mm_reshaped_only_rhs.cl                   | 1399 ------------------
 src/core/CL/cl_kernels/common/gemm.cl              | 1508 +-------------------
 .../CL/cl_kernels/common/gemm_reshaped_rhs_only.cl |  953 +++++++++++++
 src/core/CL/cl_kernels/tile_helpers.h              |  188 ++-
 src/gpu/cl/ClKernelLibrary.cpp                     |   22 +-
 .../ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp  |  124 +-
 .../ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h    |    3 +-
 src/gpu/cl/operators/ClGemm.cpp                    |    5 +-
 .../CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp       |   73 +-
 tests/validation/fixtures/GEMMFixture.h            |    3 +-
 13 files changed, 1199 insertions(+), 3086 deletions(-)
 delete mode 100644 src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
 create mode 100644 src/core/CL/cl_kernels/common/gemm_reshaped_rhs_only.cl

diff --git a/Android.bp b/Android.bp
index 136714b260..5c48a70251 100644
--- a/Android.bp
+++ b/Android.bp
@@ -30,7 +30,6 @@ opencl_srcs = [
     "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h",
     "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl",
     "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl",
-    "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl",
     "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h",
     "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h",
     "src/core/CL/cl_kernels/common/fft.cl",
@@ -40,6 +39,7 @@ opencl_srcs = [
     "src/core/CL/cl_kernels/common/floor.cl",
     "src/core/CL/cl_kernels/common/gather.cl",
     "src/core/CL/cl_kernels/common/gemm.cl",
+    "src/core/CL/cl_kernels/common/gemm_reshaped_rhs_only.cl",
     "src/core/CL/cl_kernels/common/gemm_utils.cl",
     "src/core/CL/cl_kernels/common/gemmlowp.cl",
     "src/core/CL/cl_kernels/common/gemv.cl",
diff --git a/SConscript b/SConscript
index c48a850215..320538e2e5 100644
--- a/SConscript
+++ b/SConscript
@@ -346,9 +346,9 @@ if env['opencl'] and env['embed_kernels']:
     'src/core/CL/cl_kernels/common/gather.cl',
     'src/core/CL/cl_kernels/common/gemm.cl',
     'src/core/CL/cl_kernels/common/gemm_utils.cl',
+    'src/core/CL/cl_kernels/common/gemm_reshaped_rhs_only.cl',
     'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl',
     'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl',
-    'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl',
     'src/core/CL/cl_kernels/common/gemv.cl',
     'src/core/CL/cl_kernels/common/gemmlowp.cl',
     'src/core/CL/cl_kernels/common/generate_proposals.cl',
diff --git a/src/core/CL/CLUtils.cpp b/src/core/CL/CLUtils.cpp
index 5de9a55686..34ffbb7c6c 100644
--- a/src/core/CL/CLUtils.cpp
+++ b/src/core/CL/CLUtils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -127,6 +127,7 @@ void PostOpCLKernelUtils::set_post_ops_cl_build_options(CLBuildOptions &build_op
     {
         const auto &post_op     = post_ops.get_list().at(post_op_id);
         const auto slot_prefix  = "-DP" + support::cpp11::to_string(slots[post_op_id]);
+        build_opts.add_option("-DPOST_OP" + support::cpp11::to_string(slots[post_op_id]));
         if(post_op->type() == experimental::PostOpType::Activation)
         {
             const auto _post_op = utils::cast::polymorphic_downcast *>(post_op.get());
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
deleted file mode 100644
index 7f4ad814fb..0000000000
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
+++ /dev/null
@@ -1,1399 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */ -#include "fp_post_ops_act_eltwise_op_act.h" -#include "gemm_helpers.h" -#include "repeat.h" - -/** (EXPERIMENTAL_POST_OPS) gemm_mm_reshaped_only_rhs kernel */ -#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) -#if defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH) - -#define CONCAT(a, b) a##b - -#define ARM_DOT1(a, b, c) \ - ({ \ - c = fma(a, b, c); \ - }) -#define ARM_DOT2(a, b, c) \ - ({ \ - c = fma(a.s0, b.s0, c); \ - c = fma(a.s1, b.s1, c); \ - }) -#define ARM_DOT3(a, b, c) \ - ({ \ - ARM_DOT2(a, b, c); \ - c = fma((a.s2), (b.s2), c); \ - }) -#define ARM_DOT4(a, b, c) \ - ({ \ - ARM_DOT3(a, b, c); \ - c = fma((a.s3), (b.s3), c); \ - }) -#define ARM_DOT8(a, b, c) \ - ({ \ - ARM_DOT4((a.lo), (b.lo), c); \ - ARM_DOT4((a.hi), (b.hi), c); \ - }) -#define ARM_DOT16(a, b, c) \ - ({ \ - ARM_DOT8((a.lo), (b.lo), c); \ - ARM_DOT8((a.hi), (b.hi), c); \ - }) - -#if N0 == 2 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - }) -#elif N0 == 3 // N0 == 3 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##2), (c.s2)); \ - }) -#elif N0 == 4 // N0 == 4 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##2), (c.s2)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##3), (c.s3)); \ - }) -#elif N0 == 8 // N0 == 8 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##2), (c.s2)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##3), (c.s3)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##4), (c.s4)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##5), (c.s5)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##6), (c.s6)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##7), (c.s7)); \ - }) -#elif N0 == 16 // N0 == 16 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##2), (c.s2)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##3), (c.s3)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##4), (c.s4)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##5), (c.s5)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##6), (c.s6)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##7), (c.s7)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##8), (c.s8)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##9), (c.s9)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##A), (c.sA)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##B), (c.sB)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##C), (c.sC)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##D), (c.sD)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##E), (c.sE)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##F), (c.sF)); \ - }) -#else // N0 not supported -#error "N0 value not supported" -#endif // N0 conditions - -#if defined(GEMM_MM_RESHAPED_ONLY_RHS_T_POST_ACT_ELTWISE_OP_ACT) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops: - * Post op 1: activation (optional) - * Post op 2: elementwise op - * Post op 3: activation (optional) - * - * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * @note (Required) 
-DP2_ELTWISE_OP: The (binary) elementwise post op to perform - * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * - * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_t, with these additions: - * - * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32 - * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes) - * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes) - * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes) - */ -__kernel void gemm_mm_reshaped_only_rhs_t_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs), - IMAGE_DECLARATION(rhs), -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - // Post Op arguments - IMAGE_DECLARATION(eltwise_operand), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z, - uint eltwise_operand_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint lhs_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - , - const int M, - const int N, - const int K) -{ - // Block size -#define RHS_BLOCK_SIZE ((K0) * (N0)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (K0) -#define RHS_STEP_X ((K0) * (H0)) -#define RHS_STEP_LOOP (1) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X (K0) -#define RHS_STEP_LOOP (H0) -#endif // defined(RHS_INTERLEAVE) - - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - - const bool cond_y = y == 0; - const bool cond_x = ((x + 1) * N0 >= N); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; - - // Compute RHS reshaped matrix address - uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; -#else // defined(MATRIX_B_DEPTH) - rhs_offset += z * rhs_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... 
zlhs7=0; - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); - -#if defined(REINTERPRET_INPUT_AS_3D) - // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - lhs_offset += z * lhs_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0; - - int i = 0; - for(; i <= (K - K0); i += K0) - { - // Supported cases (M0, K0): - // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 - // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 - // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 - // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 - // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 - // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 - // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 - // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); - - // Load values from RHS reshaped matrix - LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero); - - // Accumulate - ARM_DOT_K0XN0(K0, a0, b, c0); -#if M0 > 1 - ARM_DOT_K0XN0(K0, a1, b, c1); -#endif // M0 > 1 -#if M0 > 2 - ARM_DOT_K0XN0(K0, a2, b, c2); -#endif // M0 > 2 -#if M0 > 3 - ARM_DOT_K0XN0(K0, a3, b, c3); -#endif // M0 > 3 -#if M0 > 4 - ARM_DOT_K0XN0(K0, a4, b, c4); -#endif // M0 > 4 -#if M0 > 5 - ARM_DOT_K0XN0(K0, a5, b, c5); -#endif // M0 > 5 -#if M0 > 6 - ARM_DOT_K0XN0(K0, a6, b, c6); -#endif // M0 > 6 -#if M0 > 7 - ARM_DOT_K0XN0(K0, a7, b, c7); -#endif // M0 > 7 - - lhs_offset += K0 * sizeof(DATA_TYPE); - rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE); - } - - // Left-over accumulations - for(; i < K; ++i) - { - // Load values from LHS matrix - LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); - - // Load values from RHS reshaped matrix - LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero); - - // Accumulate - ARM_DOT_K0XN0(1, a0, b, c0); -#if M0 > 1 - ARM_DOT_K0XN0(1, a1, b, c1); -#endif // M0 > 1 -#if M0 > 2 - ARM_DOT_K0XN0(1, a2, b, c2); -#endif // M0 > 2 -#if M0 > 3 - ARM_DOT_K0XN0(1, a3, b, c3); -#endif // M0 > 3 -#if M0 > 4 - ARM_DOT_K0XN0(1, a4, b, c4); -#endif // M0 > 4 -#if M0 > 5 - ARM_DOT_K0XN0(1, a5, b, c5); -#endif // M0 > 5 -#if M0 > 6 - ARM_DOT_K0XN0(1, a6, b, c6); -#endif // M0 > 6 -#if M0 > 7 - ARM_DOT_K0XN0(1, a7, b, c7); -#endif // M0 > 7 - - lhs_offset += sizeof(DATA_TYPE); - rhs_offset += sizeof(DATA_TYPE); - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z; - - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(M0, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - - // c = act(c) - POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - // c = c + eltwise_operand (mix-precision, broadcast, boundary aware) - POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x); - // c = act(c) - POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - - // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -} -#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_T_POST_ACT_ELTWISE_OP_ACT) - -#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object. - * Post op 1: activation (optional) - * Post op 2: elementwise op - * Post op 3: activation (optional) - * - * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform - * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * - * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_t_texture, with these additions: - * - * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. 
Supported data type: F16/F32 - * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes) - * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes) - * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes) - * @param[in] M Number of rows in LHS matrix not reshaped. - * @param[in] N Number of columns in RHS matrix not reshaped. - * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped. - */ -__kernel void gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs), - __read_only image2d_t rhs_img, -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - // Post Op arguments - IMAGE_DECLARATION(eltwise_operand), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z, - uint eltwise_operand_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint lhs_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - , - const int M, - const int N, - const int K) -{ - // Pixel unit -#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0) - - const uint LEFTOVER_K = K % K0; - - // Block size -#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (PIXEL_UNIT) -#define RHS_STEP_X (PIXEL_UNIT * (H0)) -#define RHS_STEP_LOOP (1) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X PIXEL_UNIT -#define RHS_STEP_LOOP (H0) -#endif // defined(RHS_INTERLEAVE) - - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - - const bool cond_y = y == 0; - const bool cond_x = ((x + 1) * N0 >= N); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH); -#else // defined(MATRIX_B_DEPTH) - const uint z_rhs = get_global_id(2); -#endif // defined(MATRIX_B_DEPTH) - - // Compute RHS matrix coordinates - uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X; - const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT; - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); - -#if defined(REINTERPRET_INPUT_AS_3D) - // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - lhs_offset += z * lhs_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); - - int i = 0; - for(; i <= (K - K0); i += K0) - { - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); - - // Load values from RHS matrix stored in a cl_image - REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0); - LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0); - - // Accumulate - ARM_DOT_K0XN0(K0, a0, b, c0); -#if M0 > 1 - ARM_DOT_K0XN0(K0, a1, b, c1); -#endif // M0 > 1 -#if M0 > 2 - ARM_DOT_K0XN0(K0, a2, b, c2); -#endif // M0 > 2 -#if M0 > 3 - ARM_DOT_K0XN0(K0, a3, b, c3); -#endif // M0 > 3 -#if M0 > 4 - ARM_DOT_K0XN0(K0, a4, b, c4); -#endif // M0 > 4 -#if M0 > 5 - ARM_DOT_K0XN0(K0, a5, b, c5); -#endif // M0 > 5 -#if M0 > 6 - ARM_DOT_K0XN0(K0, a6, b, c6); -#endif // M0 > 6 -#if M0 > 7 - ARM_DOT_K0XN0(K0, a7, b, c7); -#endif // M0 > 7 - - lhs_offset += K0 * sizeof(DATA_TYPE); - x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP; - } - - if(LEFTOVER_K != 0) - { - // Note: We cannot read out-of-bound elements from the RHS matrix because - // the RHS width is always multiple of K0. This is not be true for the LHS matrix - - union UNION_VEC_TYPE - { - DATA_TYPE s[K0]; - VEC_DATA_TYPE(DATA_TYPE, K0) - v; - }; - - union UNION_VEC_TYPE a0 = {.v = 0 }; -#if M0 > 1 - union UNION_VEC_TYPE a1 = {.v = 0 }; -#endif // M0 > 1 -#if M0 > 2 - union UNION_VEC_TYPE a2 = {.v = 0 }; -#endif // M0 > 2 -#if M0 > 3 - union UNION_VEC_TYPE a3 = {.v = 0 }; -#endif // M0 > 3 -#if M0 > 4 - union UNION_VEC_TYPE a4 = {.v = 0 }; -#endif // M0 > 4 -#if M0 > 5 - union UNION_VEC_TYPE a5 = {.v = 0 }; -#endif // M0 > 5 -#if M0 > 6 - union UNION_VEC_TYPE a6 = {.v = 0 }; -#endif // M0 > 6 -#if M0 > 7 - union UNION_VEC_TYPE a7 = {.v = 0 }; -#endif // M0 > 7 - - REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0); - - // Load from RHS matrix - LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0); - - // Load from LHS matrix - for(int k = 0; k < LEFTOVER_K; ++k) - { - a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0); -#if M0 > 1 - a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1); -#endif // M0 > 1 -#if M0 > 2 - a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2); -#endif // M0 > 2 -#if M0 > 3 - a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3); -#endif // M0 > 3 -#if M0 > 4 - a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4); -#endif // M0 > 4 -#if M0 > 5 - a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5); -#endif // M0 > 5 -#if M0 > 6 - a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6); -#endif // M0 > 6 -#if M0 > 7 - a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7); -#endif // M0 > 7 - - lhs_offset += sizeof(DATA_TYPE); - } - - // Accumulate - ARM_DOT_K0XN0(K0, a0.v, b, c0); -#if M0 > 1 - ARM_DOT_K0XN0(K0, a1.v, b, c1); -#endif // M0 > 1 -#if M0 > 2 - ARM_DOT_K0XN0(K0, a2.v, b, c2); -#endif // M0 > 2 -#if M0 > 3 - 
ARM_DOT_K0XN0(K0, a3.v, b, c3); -#endif // M0 > 3 -#if M0 > 4 - ARM_DOT_K0XN0(K0, a4.v, b, c4); -#endif // M0 > 4 -#if M0 > 5 - ARM_DOT_K0XN0(K0, a5.v, b, c5); -#endif // M0 > 5 -#if M0 > 6 - ARM_DOT_K0XN0(K0, a6.v, b, c6); -#endif // M0 > 6 -#if M0 > 7 - ARM_DOT_K0XN0(K0, a7.v, b, c7); -#endif // M0 > 7 - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z; - - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(M0, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - - // c = act(c) - POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - // c = c + eltwise_operand (mix-precision, broadcast, boundary aware) - POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x); - // c = act(c) - POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - - // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -#undef PIXEL_UNIT -} -#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT) - -#define VFMA(a, b, c) \ - ({ \ - c = fma(a, b, c); \ - }) - -#if M0 == 1 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - }) -#elif M0 == 2 // M0 == 2 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - 
VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - }) -#elif M0 == 3 // M0 == 3 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - }) -#elif M0 == 4 // M0 == 4 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - }) -#elif M0 == 5 // M0 == 5 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - }) -#elif M0 == 6 // M0 == 6 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - }) -#elif M0 == 7 // M0 == 7 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ - }) -#elif M0 == 8 // M0 == 8 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \ - }) -#else // M0 not supported -#error "M0 not supported" -#endif // M0 not supported - -#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_POST_ACT_ELTWISE_OP_ACT) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops: - * Post op 1: activation (optional) - * Post op 2: elementwise op - * Post op 3: activation (optional) - * - * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform - * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * 
@note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * - * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_nt, with these additions: - * - * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32 - * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes) - * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes) - * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes) - * @param[in] M Number of rows in LHS matrix not reshaped. - * @param[in] N Number of columns in RHS matrix not reshaped. - * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped. - */ -__kernel void gemm_mm_reshaped_only_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs), - IMAGE_DECLARATION(rhs), -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - // Post Op arguments - IMAGE_DECLARATION(eltwise_operand), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z, - uint eltwise_operand_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint lhs_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - , - const int M, - const int N, - const int K) -{ - // Block size -#define RHS_BLOCK_SIZE ((K0) * (N0)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (N0) -#define RHS_STEP_X ((N0) * (H0)) -#define RHS_STEP_LOOP (1) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X (N0) -#define RHS_STEP_LOOP (H0) -#endif // defined(RHS_INTERLEAVE) - - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - - const bool cond_y = y == 0; - const bool cond_x = ((x + 1) * N0 >= N); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; - - // Compute RHS reshaped matrix address - uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; -#else // defined(MATRIX_B_DEPTH) - rhs_offset += z * rhs_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0; - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... 
zero7=0; - -#if defined(REINTERPRET_INPUT_AS_3D) - - // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - lhs_offset += z * lhs_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(N0-1)=0; - - int i = 0; - for(; i <= (K - K0); i += K0) - { - // Supported cases (M0, K0): - // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 - // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 - // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 - // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 - // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 - // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 - // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 - // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin); - - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(0, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(1, a, b0, c); -#if K0 > 2 - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(2, a, b0, c); -#endif // K0 > 2 -#if K0 > 3 - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(3, a, b0, c); -#endif // K0 > 3 -#if K0 > 4 - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(4, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(5, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(6, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(7, a, b0, c); -#endif // K0 > 4 -#if K0 > 8 - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(8, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(9, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(A, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(B, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(C, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(D, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(E, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(F, a, b0, c); -#endif // K0 > 8 - - lhs_offset += K0 * 
sizeof(DATA_TYPE); - rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE); - } - - // Left-over accumulations - for(; i < K; ++i) - { - // Load values from LHS matrix - VEC_DATA_TYPE(DATA_TYPE, 2) - a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0)); -#if M0 > 1 - VEC_DATA_TYPE(DATA_TYPE, 2) - a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1)); -#endif // M0 > 1 -#if M0 > 2 - VEC_DATA_TYPE(DATA_TYPE, 2) - a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2)); -#endif // M0 > 2 -#if M0 > 3 - VEC_DATA_TYPE(DATA_TYPE, 2) - a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3)); -#endif // M0 > 3 -#if M0 > 4 - VEC_DATA_TYPE(DATA_TYPE, 2) - a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4)); -#endif // M0 > 4 -#if M0 > 5 - VEC_DATA_TYPE(DATA_TYPE, 2) - a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5)); -#endif // M0 > 5 -#if M0 > 6 - VEC_DATA_TYPE(DATA_TYPE, 2) - a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6)); -#endif // M0 > 6 -#if M0 > 7 - VEC_DATA_TYPE(DATA_TYPE, 2) - a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7)); -#endif // M0 > 7 - - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(0, a, b0, c); - - lhs_offset += sizeof(DATA_TYPE); - rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE); - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z; - - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(M0, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - - // c = act(c) - POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - // c = c + eltwise_operand (mix-precision, broadcast, boundary aware) - POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x); - // c = act(c) - POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - - // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -#undef RHS_STEP_LOOP -} -#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_POST_ACT_ELTWISE_OP_ACT) - -#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object. - * Post op 1: activation (optional) - * Post op 2: elementwise op - * Post op 3: activation (optional) - * - * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform - * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2 - * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3 - * - * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_nt_texture, with these additions: - * - * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. 
Supported data type: F16/F32 - * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes) - * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes) - * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes) - * @param[in] M Number of rows in LHS matrix not reshaped. - * @param[in] N Number of columns in RHS matrix not reshaped. - * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped. - */ -__kernel void gemm_mm_reshaped_only_rhs_nt_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs), - __read_only image2d_t rhs_img, -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - // Post Op arguments - IMAGE_DECLARATION(eltwise_operand), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z, - uint eltwise_operand_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint lhs_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - , - const int M, - const int N, - const int K) -{ - // Pixel unit -#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0) - - // Block size -#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (PIXEL_UNIT) -#define RHS_STEP_X ((PIXEL_UNIT) * (H0)) -#define RHS_STEP_LOOP (1) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X (PIXEL_UNIT) -#define RHS_STEP_LOOP (H0) -#endif // defined(RHS_INTERLEAVE) - - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - - const bool cond_y = y == 0; - const bool cond_x = ((x + 1) * N0 >= N); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - const uint z_rhs = (z % MATRIX_B_DEPTH); -#else // defined(MATRIX_B_DEPTH) - const uint z_rhs = z; -#endif // defined(MATRIX_B_DEPTH) - - // Compute RHS matrix coordinates - uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X; - const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT; - - REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); - -#if defined(REINTERPRET_INPUT_AS_3D) - - // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - lhs_offset += z * lhs_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); - - int i = 0; - for(; i <= (K - K0); i += K0) - { - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin); - - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(0, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(1, a, b0, c); -#if K0 > 2 - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(2, a, b0, c); -#endif // K0 > 2 -#if K0 > 3 - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(3, a, b0, c); -#endif // K0 > 3 -#if K0 > 4 - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(4, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(5, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(6, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(7, a, b0, c); -#endif // K0 > 4 -#if K0 > 8 - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(8, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(9, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(A, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(B, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(C, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(D, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(E, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(F, a, b0, c); -#endif // K0 > 8 - - lhs_offset += K0 * sizeof(DATA_TYPE); - x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP; - } - - // Left-over accumulations - for(; i < K; ++i) - { - // Load values from LHS matrix - VEC_DATA_TYPE(DATA_TYPE, 2) - a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0)); -#if M0 > 1 - VEC_DATA_TYPE(DATA_TYPE, 2) - a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1)); -#endif // M0 > 1 -#if M0 > 2 - VEC_DATA_TYPE(DATA_TYPE, 2) - a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2)); -#endif // M0 > 2 -#if M0 > 3 - VEC_DATA_TYPE(DATA_TYPE, 2) - a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3)); -#endif // M0 > 3 -#if M0 > 4 - VEC_DATA_TYPE(DATA_TYPE, 2) - a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4)); -#endif // M0 > 4 -#if M0 > 5 - VEC_DATA_TYPE(DATA_TYPE, 2) - a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * 
lhs_stride_y + zin5)); -#endif // M0 > 5 -#if M0 > 6 - VEC_DATA_TYPE(DATA_TYPE, 2) - a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6)); -#endif // M0 > 6 -#if M0 > 7 - VEC_DATA_TYPE(DATA_TYPE, 2) - a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7)); -#endif // M0 > 7 - - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs)); - - VFMA_M0xN0(0, a, b0, c); - - lhs_offset += sizeof(DATA_TYPE); - x_rhs += RHS_STEP_X; - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z; - - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(M0, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - - // c = act(c) - POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - // c = c + eltwise_operand (mix-precision, broadcast, boundary aware) - POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, 1, PARTIAL_STORE_N0, false, cond_x); - // c = act(c) - POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c); - - // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -#undef RHS_STEP_LOOP -} -#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT) -#endif // defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && 
defined(P2_ELTWISE_ARG1_WIDTH) -#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) diff --git a/src/core/CL/cl_kernels/common/gemm.cl b/src/core/CL/cl_kernels/common/gemm.cl index cc7392d728..74e2e5097e 100644 --- a/src/core/CL/cl_kernels/common/gemm.cl +++ b/src/core/CL/cl_kernels/common/gemm.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,1512 +24,6 @@ #include "gemm_helpers.h" #include "repeat.h" -#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) - -#define CONCAT(a, b) a##b - -#define ARM_DOT1(a, b, c) \ - ({ \ - c = fma(a, b, c); \ - }) -#define ARM_DOT2(a, b, c) \ - ({ \ - c = fma(a.s0, b.s0, c); \ - c = fma(a.s1, b.s1, c); \ - }) -#define ARM_DOT3(a, b, c) \ - ({ \ - ARM_DOT2(a, b, c); \ - c = fma((a.s2), (b.s2), c); \ - }) -#define ARM_DOT4(a, b, c) \ - ({ \ - ARM_DOT3(a, b, c); \ - c = fma((a.s3), (b.s3), c); \ - }) -#define ARM_DOT8(a, b, c) \ - ({ \ - ARM_DOT4((a.lo), (b.lo), c); \ - ARM_DOT4((a.hi), (b.hi), c); \ - }) -#define ARM_DOT16(a, b, c) \ - ({ \ - ARM_DOT8((a.lo), (b.lo), c); \ - ARM_DOT8((a.hi), (b.hi), c); \ - }) - -#if N0 == 2 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - }) -#elif N0 == 3 // N0 == 3 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##2), (c.s2)); \ - }) -#elif N0 == 4 // N0 == 4 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##2), (c.s2)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##3), (c.s3)); \ - }) -#elif N0 == 8 // N0 == 8 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##2), (c.s2)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##3), (c.s3)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##4), (c.s4)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##5), (c.s5)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##6), (c.s6)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##7), (c.s7)); \ - }) -#elif N0 == 16 // N0 == 16 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##2), (c.s2)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##3), (c.s3)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##4), (c.s4)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##5), (c.s5)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##6), (c.s6)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##7), (c.s7)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##8), (c.s8)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##9), (c.s9)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##A), (c.sA)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##B), (c.sB)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##C), (c.sC)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##D), (c.sD)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##E), (c.sE)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##F), (c.sF)); \ - }) -#else // N0 not supported -#error "N0 value not supported" -#endif // N0 conditions - -#if defined(GEMM_MM_RESHAPED_ONLY_RHS_T) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices. 
- * The LHS matrix is NOT reshaped - * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed - * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl - * - * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. - * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters. - * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4). - * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2) - * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2) - * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time. - * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) - * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) - * @note Only the following configurations of M0, N0 and K0 are currently supported: - * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 - * - N0 = 2, 3, 4, 8, 16 - * - K0 = 2, 3, 4, 8, 16 - * - H0 >= 1 - * - * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. - * The activation function is performed after the bias addition - * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time: - * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix - * - * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32 - * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes) - * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes) - * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix - * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr - * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes) - * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes) - * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix - * @param[in] bias_ptr (Optional) Pointer to the bias matrix. 
Supported data type: same as @p lhs_ptr - * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes) - * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) - * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - * @param[in] M Number of rows in LHS matrix not reshaped. - * @param[in] N Number of columns in RHS matrix not reshaped. - * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped. 
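- * @note Example build-option set for this kernel (illustrative values picked from the supported
- *       ranges above, not defaults): -DDATA_TYPE=float -DM0=4 -DN0=4 -DK0=4 -DH0=4
- *       -DPARTIAL_STORE_M0=1 -DPARTIAL_STORE_N0=1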
- */ -__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), - IMAGE_DECLARATION(rhs), -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint lhs_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - , - const int M, - const int N, - const int K) -{ - // Block size -#define RHS_BLOCK_SIZE ((K0) * (N0)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (K0) -#define RHS_STEP_X ((K0) * (H0)) -#define RHS_STEP_LOOP (1) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X (K0) -#define RHS_STEP_LOOP (H0) -#endif // defined(RHS_INTERLEAVE) - - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - - const bool cond_y = y == 0; - const bool cond_x = ((x + 1) * N0 >= N); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; - - // Compute RHS reshaped matrix address - uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; -#else // defined(MATRIX_B_DEPTH) - rhs_offset += z * rhs_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0; - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); - -#if defined(REINTERPRET_INPUT_AS_3D) - // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - lhs_offset += z * lhs_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... 
c(M0-1)=0; - - int i = 0; - for(; i <= (K - K0); i += K0) - { - // Supported cases (M0, K0): - // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 - // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 - // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 - // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 - // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 - // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 - // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 - // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); - - // Load values from RHS reshaped matrix - LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero); - - // Accumulate - ARM_DOT_K0XN0(K0, a0, b, c0); -#if M0 > 1 - ARM_DOT_K0XN0(K0, a1, b, c1); -#endif // M0 > 1 -#if M0 > 2 - ARM_DOT_K0XN0(K0, a2, b, c2); -#endif // M0 > 2 -#if M0 > 3 - ARM_DOT_K0XN0(K0, a3, b, c3); -#endif // M0 > 3 -#if M0 > 4 - ARM_DOT_K0XN0(K0, a4, b, c4); -#endif // M0 > 4 -#if M0 > 5 - ARM_DOT_K0XN0(K0, a5, b, c5); -#endif // M0 > 5 -#if M0 > 6 - ARM_DOT_K0XN0(K0, a6, b, c6); -#endif // M0 > 6 -#if M0 > 7 - ARM_DOT_K0XN0(K0, a7, b, c7); -#endif // M0 > 7 - - lhs_offset += K0 * sizeof(DATA_TYPE); - rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE); - } - - // Left-over accumulations - for(; i < K; ++i) - { - // Load values from LHS matrix - LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); - - // Load values from RHS reshaped matrix - LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero); - - // Accumulate - ARM_DOT_K0XN0(1, a0, b, c0); -#if M0 > 1 - ARM_DOT_K0XN0(1, a1, b, c1); -#endif // M0 > 1 -#if M0 > 2 - ARM_DOT_K0XN0(1, a2, b, c2); -#endif // M0 > 2 -#if M0 > 3 - ARM_DOT_K0XN0(1, a3, b, c3); -#endif // M0 > 3 -#if M0 > 4 - ARM_DOT_K0XN0(1, a4, b, c4); -#endif // M0 > 4 -#if M0 > 5 - ARM_DOT_K0XN0(1, a5, b, c5); -#endif // M0 > 5 -#if M0 > 6 - ARM_DOT_K0XN0(1, a6, b, c6); -#endif // M0 > 6 -#if M0 > 7 - ARM_DOT_K0XN0(1, a7, b, c7); -#endif // M0 > 7 - - lhs_offset += sizeof(DATA_TYPE); - rhs_offset += sizeof(DATA_TYPE); - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - - // Add offset for batched GEMM. 
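-    // (Worked example with assumed values: for z = 1 and DEPTH_GEMM3D = 2, the statement below
-    //  advances dst_addr by 1 * dst_stride_z * 2 bytes, i.e. past the two 2D planes of batch 0.)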
The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z; - - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(M0, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -#undef RHS_STEP_LOOP -} -#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_T) - -#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image - * The LHS matrix is NOT reshaped - * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed - * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl - * - * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel - * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. - * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters. - * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT= (e.g. -DRHS_HEIGHT=32) - * Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT - * could be different from the value returned by get_image_height(rhs_img). - * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4). - * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. 
-DM0=2) - * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2) - * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time. - * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) - * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) - * @note Only the following configurations of M0, N0 and K0 are currently supported: - * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 - * - N0 = 4, 8, 16 - * - K0 = 4, 8, 16 - * - H0 >= 1 - * - * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. - * The activation function is performed after the bias addition - * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time: - * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix - * - * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F32 - * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes) - * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes) - * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix - * @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr - * @param[in] bias_ptr (Optional) Pointer to the bias matrix. 
Supported data type: same as @p lhs_ptr - * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes) - * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) - * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - * @param[in] M Number of rows in LHS matrix not reshaped. - * @param[in] N Number of columns in RHS matrix not reshaped. - * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped. 
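- * @note Worked example (F32 data, as this kernel requires): with -DK0=4, one float4 texel holds
- *       exactly K0 values, so PIXEL_UNIT = 1 and each K0-wide RHS block is one image read.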
- */ -__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs), - __read_only image2d_t rhs_img, -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint lhs_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - , - const int M, - const int N, - const int K) -{ - // Pixel unit -#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0) - - const uint LEFTOVER_K = K % K0; - - // Block size -#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (PIXEL_UNIT) -#define RHS_STEP_X (PIXEL_UNIT * (H0)) -#define RHS_STEP_LOOP (1) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X PIXEL_UNIT -#define RHS_STEP_LOOP (H0) -#endif // defined(RHS_INTERLEAVE) - - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - - const bool cond_y = y == 0; - const bool cond_x = ((x + 1) * N0 >= N); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH); -#else // defined(MATRIX_B_DEPTH) - const uint z_rhs = get_global_id(2); -#endif // defined(MATRIX_B_DEPTH) - - // Compute RHS matrix coordinates - uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X; - const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT; - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); - -#if defined(REINTERPRET_INPUT_AS_3D) - // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); - - // Add offset for batched GEMM. 
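-    // (Sketch of the RHS image coordinates computed above, with assumed values: for
-    //  get_global_id(0) = 5 and H0 = 2, x_rhs = 1 * RHS_OFFSET_X and y_rhs = 2 + z_rhs * RHS_HEIGHT.)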
-    // The batches will be in the fourth dimension and for this reason we
-    // multiply lhs_stride_z by DEPTH_GEMM3D
-    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
-    // Add offset for batched GEMM
-    lhs_offset += z * lhs_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
-    // Initialize the accumulators
-    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
-
-    int i = 0;
-    for(; i <= (K - K0); i += K0)
-    {
-        // Load values from LHS matrix
-        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
-
-        // Load values from RHS matrix stored in a cl_image
-        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
-        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
-
-        // Accumulate
-        ARM_DOT_K0XN0(K0, a0, b, c0);
-#if M0 > 1
-        ARM_DOT_K0XN0(K0, a1, b, c1);
-#endif // M0 > 1
-#if M0 > 2
-        ARM_DOT_K0XN0(K0, a2, b, c2);
-#endif // M0 > 2
-#if M0 > 3
-        ARM_DOT_K0XN0(K0, a3, b, c3);
-#endif // M0 > 3
-#if M0 > 4
-        ARM_DOT_K0XN0(K0, a4, b, c4);
-#endif // M0 > 4
-#if M0 > 5
-        ARM_DOT_K0XN0(K0, a5, b, c5);
-#endif // M0 > 5
-#if M0 > 6
-        ARM_DOT_K0XN0(K0, a6, b, c6);
-#endif // M0 > 6
-#if M0 > 7
-        ARM_DOT_K0XN0(K0, a7, b, c7);
-#endif // M0 > 7
-
-        lhs_offset += K0 * sizeof(DATA_TYPE);
-        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
-    }
-
-    if(LEFTOVER_K != 0)
-    {
-        // Note: We cannot read out-of-bound elements from the RHS matrix because
-        // the RHS width is always a multiple of K0. This is not true for the LHS matrix
-        // Left-over accumulations for LHS matrix
-
-        union UNION_VEC_TYPE
-        {
-            DATA_TYPE s[K0];
-            VEC_DATA_TYPE(DATA_TYPE, K0)
-            v;
-        };
-
-        union UNION_VEC_TYPE a0 = {.v = 0 };
-#if M0 > 1
-        union UNION_VEC_TYPE a1 = {.v = 0 };
-#endif // M0 > 1
-#if M0 > 2
-        union UNION_VEC_TYPE a2 = {.v = 0 };
-#endif // M0 > 2
-#if M0 > 3
-        union UNION_VEC_TYPE a3 = {.v = 0 };
-#endif // M0 > 3
-#if M0 > 4
-        union UNION_VEC_TYPE a4 = {.v = 0 };
-#endif // M0 > 4
-#if M0 > 5
-        union UNION_VEC_TYPE a5 = {.v = 0 };
-#endif // M0 > 5
-#if M0 > 6
-        union UNION_VEC_TYPE a6 = {.v = 0 };
-#endif // M0 > 6
-#if M0 > 7
-        union UNION_VEC_TYPE a7 = {.v = 0 };
-#endif // M0 > 7
-
-        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
-
-        // Load from RHS matrix
-        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
-
-        // Load from LHS matrix
-        for(int k = 0; k < LEFTOVER_K; ++k)
-        {
-            a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
-#if M0 > 1
-            a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
-#endif // M0 > 1
-#if M0 > 2
-            a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
-#endif // M0 > 2
-#if M0 > 3
-            a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
-#endif // M0 > 3
-#if M0 > 4
-            a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
-#endif // M0 > 4
-#if M0 > 5
-            a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
-#endif // M0 > 5
-#if M0 > 6
-            a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
-#endif // M0 > 6
-#if M0 > 7
-            a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
-#endif // M0 > 7
-
-            lhs_offset += sizeof(DATA_TYPE);
-        }
-
-        // Accumulate
-        ARM_DOT_K0XN0(K0, a0.v, b, c0);
-#if M0 > 1
-        ARM_DOT_K0XN0(K0, a1.v, b, c1);
-#endif // M0 > 1
-#if M0 > 2
-        ARM_DOT_K0XN0(K0,
a2.v, b, c2); -#endif // M0 > 2 -#if M0 > 3 - ARM_DOT_K0XN0(K0, a3.v, b, c3); -#endif // M0 > 3 -#if M0 > 4 - ARM_DOT_K0XN0(K0, a4.v, b, c4); -#endif // M0 > 4 -#if M0 > 5 - ARM_DOT_K0XN0(K0, a5.v, b, c5); -#endif // M0 > 5 -#if M0 > 6 - ARM_DOT_K0XN0(K0, a6.v, b, c6); -#endif // M0 > 6 -#if M0 > 7 - ARM_DOT_K0XN0(K0, a7.v, b, c7); -#endif // M0 > 7 - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z; - - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(M0, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -#undef RHS_STEP_LOOP -#undef PIXEL_UNIT -} -#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE) - -#define VFMA(a, b, c) \ - ({ \ - c = fma(a, b, c); \ - }) - -#if M0 == 1 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - }) -#elif M0 == 2 // M0 == 2 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - }) -#elif M0 == 3 // M0 == 3 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - 
VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - }) -#elif M0 == 4 // M0 == 4 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - }) -#elif M0 == 5 // M0 == 5 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - }) -#elif M0 == 6 // M0 == 6 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - }) -#elif M0 == 7 // M0 == 7 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ - }) -#elif M0 == 8 // M0 == 8 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \ - }) -#else // M0 not supported -#error "M0 not supported" -#endif // M0 not supported - -#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices. - * The LHS matrix is NOT reshaped - * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed - * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl - * - * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. - * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters. - * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4). - * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2) - * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. 
-DH0=2) - * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time. - * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) - * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) - * @note Only the following configurations of M0, N0 and K0 are currently supported: - * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 - * - N0 = 2, 3, 4, 8, 16 - * - K0 = 2, 3, 4, 8, 16 - * - H0 >= 1 - * - * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. - * The activation function is performed after the bias addition - * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time: - * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix - * - * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32 - * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes) - * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes) - * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix - * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr - * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes) - * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes) - * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix - * @param[in] bias_ptr (Optional) Pointer to the bias matrix. 
Supported data type: same as @p lhs_ptr - * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes) - * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) - * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - * @param[in] M Number of rows in LHS matrix not reshaped. - * @param[in] N Number of columns in RHS matrix not reshaped. - * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped. 
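- * @note Example of the RHS stepping macros defined in the kernel body (assumed -DN0=4 -DK0=4
- *       -DH0=2, no -DRHS_INTERLEAVE): RHS_BLOCK_SIZE = 16, RHS_OFFSET_X = 16, RHS_STEP_X = 4,
- *       RHS_STEP_LOOP = 2, so consecutive b0 loads are N0 elements apart in the reshaped buffer.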
- */ -__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs), - IMAGE_DECLARATION(rhs), -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint lhs_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - , - const int M, - const int N, - const int K) -{ - // Block size -#define RHS_BLOCK_SIZE ((K0) * (N0)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (N0) -#define RHS_STEP_X ((N0) * (H0)) -#define RHS_STEP_LOOP (1) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X (N0) -#define RHS_STEP_LOOP (H0) -#endif // defined(RHS_INTERLEAVE) - - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - - const bool cond_y = y == 0; - const bool cond_x = ((x + 1) * N0 >= N); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; - - // Compute RHS reshaped matrix address - uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; -#else // defined(MATRIX_B_DEPTH) - rhs_offset += z * rhs_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0; - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... zero7=0; - -#if defined(REINTERPRET_INPUT_AS_3D) - - // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - lhs_offset += z * lhs_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... 
c(M0-1)=0;
-
-    int i = 0;
-    for(; i <= (K - K0); i += K0)
-    {
-        // Supported cases (M0, K0):
-        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
-        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
-        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
-        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
-        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
-        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
-        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
-        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
-        // Load values from LHS matrix
-        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
-
-        VEC_DATA_TYPE(DATA_TYPE, N0)
-        b0;
-
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(0, a, b0, c);
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(1, a, b0, c);
-#if K0 > 2
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(2, a, b0, c);
-#endif // K0 > 2
-#if K0 > 3
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(3, a, b0, c);
-#endif // K0 > 3
-#if K0 > 4
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(4, a, b0, c);
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(5, a, b0, c);
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(6, a, b0, c);
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(7, a, b0, c);
-#endif // K0 > 4
-#if K0 > 8
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(8, a, b0, c);
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(9, a, b0, c);
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(A, a, b0, c);
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(B, a, b0, c);
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(C, a, b0, c);
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(D, a, b0, c);
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(E, a, b0, c);
-        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VFMA_M0xN0(F, a, b0, c);
-#endif // K0 > 8
-
-        lhs_offset += K0 * sizeof(DATA_TYPE);
-        rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);
-    }
-
-    // Left-over accumulations
-    for(; i < K; ++i)
-    {
-        // Load values from LHS matrix
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
-#if M0 > 1
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
-#endif // M0 > 1
-#if M0 > 2
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
-#endif // M0 > 2
-#if M0 > 3
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
-#endif // M0 > 3
-#if M0 > 4
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a4 = *((__global DATA_TYPE
*)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4)); -#endif // M0 > 4 -#if M0 > 5 - VEC_DATA_TYPE(DATA_TYPE, 2) - a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5)); -#endif // M0 > 5 -#if M0 > 6 - VEC_DATA_TYPE(DATA_TYPE, 2) - a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6)); -#endif // M0 > 6 -#if M0 > 7 - VEC_DATA_TYPE(DATA_TYPE, 2) - a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7)); -#endif // M0 > 7 - - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(0, a, b0, c); - - lhs_offset += sizeof(DATA_TYPE); - rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE); - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z; - - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(M0, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -#undef RHS_STEP_LOOP -} -#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT) - -#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices. 
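- * (The reshaped RHS is read through the OpenCL image path instead of a raw buffer; a minimal
- *  sketch of one such read, assuming F32 data: float4 texel = read_imagef(rhs_img, (int2)(x_rhs, y_rhs));)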
- * The LHS matrix is NOT reshaped - * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed - * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl - * - * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel - * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. - * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters. - * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT= (e.g. -DRHS_HEIGHT=32) - * Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT - * could be different from the value returned by get_image_height(rhs_img). - * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4). - * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2) - * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2) - * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time. - * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) - * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) - * @note Only the following configurations of M0, N0 and K0 are currently supported: - * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 - * - N0 = 4, 8, 16 - * - K0 = 4, 8, 16 - * - H0 >= 1 - * - * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. - * The activation function is performed after the bias addition - * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time: - * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix - * - * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F32 - * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes) - * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes) - * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix - * @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. 
Supported data type: same as @p lhs_ptr - * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr - * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes) - * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) - * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - * @param[in] M Number of rows in LHS matrix not reshaped. - * @param[in] N Number of columns in RHS matrix not reshaped. - * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped. 
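- * @note Worked example (F32 data, as this kernel requires): with -DN0=4, PIXEL_UNIT = 1, so each
- *       of the K0 image reads per iteration fetches one float4 texel, i.e. one full N0-wide row
- *       of the non-transposed RHS block.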
- */ -__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs), - __read_only image2d_t rhs_img, -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint lhs_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - , - const int M, - const int N, - const int K) -{ - // Pixel unit -#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0) - - // Block size -#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (PIXEL_UNIT) -#define RHS_STEP_X ((PIXEL_UNIT) * (H0)) -#define RHS_STEP_LOOP 1 -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X (PIXEL_UNIT) -#define RHS_STEP_LOOP (H0) -#endif // defined(RHS_INTERLEAVE) - - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - - const bool cond_y = y == 0; - const bool cond_x = ((x + 1) * N0 >= N); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - const uint z_rhs = (z % MATRIX_B_DEPTH); -#else // defined(MATRIX_B_DEPTH) - const uint z_rhs = z; -#endif // defined(MATRIX_B_DEPTH) - - // Compute RHS matrix coordinates - uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X; - const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT; - - REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); - -#if defined(REINTERPRET_INPUT_AS_3D) - - // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); - - // Add offset for batched GEMM. 
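-    // (Illustration with assumed values: for M0 = 2 and HEIGHT_GEMM3D = 8, a block starting at
-    //  row 7 straddles two Z planes, which is why zin is computed per row above rather than once per block.)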
The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - lhs_offset += z * lhs_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); - - int i = 0; - for(; i <= (K - K0); i += K0) - { - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin); - - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(0, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(1, a, b0, c); -#if K0 > 2 - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(2, a, b0, c); -#endif // K0 > 2 -#if K0 > 3 - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(3, a, b0, c); -#endif // K0 > 3 -#if K0 > 4 - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(4, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(5, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(6, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(7, a, b0, c); -#endif // K0 > 4 -#if K0 > 8 - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(8, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(9, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(A, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(B, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(C, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(D, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(E, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(F, a, b0, c); -#endif // K0 > 8 - - lhs_offset += K0 * sizeof(DATA_TYPE); - x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP; - } - - // Left-over accumulations - for(; i < K; ++i) - { - // Load values from LHS matrix - VEC_DATA_TYPE(DATA_TYPE, 2) - a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0)); -#if M0 > 1 - VEC_DATA_TYPE(DATA_TYPE, 2) - a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1)); -#endif // M0 > 1 -#if M0 > 2 - VEC_DATA_TYPE(DATA_TYPE, 2) - a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2)); -#endif // M0 > 2 -#if M0 > 3 - VEC_DATA_TYPE(DATA_TYPE, 2) - a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3)); -#endif // M0 > 3 -#if M0 > 4 - VEC_DATA_TYPE(DATA_TYPE, 2) - a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4)); -#endif // M0 > 4 -#if M0 > 5 - VEC_DATA_TYPE(DATA_TYPE, 2) - a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * 
lhs_stride_y + zin5)); -#endif // M0 > 5 -#if M0 > 6 - VEC_DATA_TYPE(DATA_TYPE, 2) - a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6)); -#endif // M0 > 6 -#if M0 > 7 - VEC_DATA_TYPE(DATA_TYPE, 2) - a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7)); -#endif // M0 > 7 - - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs)); - - VFMA_M0xN0(0, a, b0, c); - - lhs_offset += sizeof(DATA_TYPE); - x_rhs += RHS_STEP_X; - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z; - - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(M0, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -#undef RHS_STEP_LOOP -} -#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE) -#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) - #if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) #if defined(MIXED_PRECISION) diff --git a/src/core/CL/cl_kernels/common/gemm_reshaped_rhs_only.cl b/src/core/CL/cl_kernels/common/gemm_reshaped_rhs_only.cl new file mode 100644 index 
0000000000..1d6560a1c2
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gemm_reshaped_rhs_only.cl
@@ -0,0 +1,953 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+// *INDENT-OFF*
+// clang-format off
+#if defined(GEMM_MM_RESHAPED_ONLY_RHS_T)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
+ * Post op 1: activation (optional)
+ * Post op 2: elementwise op
+ * Post op 3: activation (optional)
+ *
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref ClGemmReshapeRhsMatrixKernel and the block K0xN0 is transposed
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *  - H0 >= 1
+ *
+ * @note In case of post ops, the following information must be passed at compile time:
+ * @note -DPOST_OP1, -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
+ * @note -DPOST_OP2: The arithmetic addition post op to perform at slot 2
+ * @note -DPOST_OP3, -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
+ *
+ * @param[in] lhs_ptr Pointer to the LHS tensor. Supported data types: F16/F32
+ * @param[in] lhs_stride_y Stride of the LHS tensor in Y dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the LHS tensor in Z dimension (in bytes)
+ * @param[in] lhs_w The size of the width dimension of the LHS tensor
+ * @param[in] lhs_h The size of the height dimension of the LHS tensor
+ * @param[in] lhs_n The size of the depth dimension of the LHS tensor
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS tensor
+ * @param[in] rhs_ptr Pointer to the RHS reshaped tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the RHS tensor in Y dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS tensor in Z dimension (in bytes)
+ * @param[in] rhs_w The size of the width dimension of the RHS tensor
+ * @param[in] rhs_h The size of the height dimension of the RHS tensor
+ * @param[in] rhs_n The size of the depth dimension of the RHS tensor
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS tensor
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bia_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bia_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bia_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bia_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bia_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[in] ex0_ptr (Optional) Pointer to the tensor added with POST_OP2. Supported data type: same as @p lhs_ptr
+ * @param[in] ex0_stride_y (Optional) Stride of the tensor added with POST_OP2 in Y dimension (in bytes)
+ * @param[in] ex0_stride_z (Optional) Stride of the tensor added with POST_OP2 in Z dimension (in bytes)
+ * @param[in] ex0_w (Optional) The size of the width dimension of the tensor added with POST_OP2
+ * @param[in] ex0_h (Optional) The size of the height dimension of the tensor added with POST_OP2
+ * @param[in] ex0_n (Optional) The size of the depth dimension of the tensor added with POST_OP2
+ * @param[in] ex0_offset_first_element_in_bytes (Optional) The offset of the first element in the tensor added with POST_OP2
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] M Number of rows in LHS matrix not reshaped
+ * @param[in] N Number of columns in RHS matrix not reshaped
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped
+ */
+//! @endcond
+__kernel void gemm_mm_reshaped_only_rhs_t(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, BUFFER),
+#if defined(BETA)
+    TENSOR3D_T(bia, BUFFER),
+#endif // defined(BETA)
+#if defined(POST_OP2)
+    TENSOR3D_T(ex0, BUFFER),
+#endif // defined(POST_OP2)
+    TENSOR3D_T(dst, BUFFER),
+    const int M,
+    const int N,
+    const int K
+)
+{
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+    const uint x = GET_SPATIAL_IDX(0, N0, 0);
+    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+    const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x >= N) || (y >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    bool x_cond = PARTIAL_STORE_N0 != 0 && ((x + N0) > N);
+    bool y_cond = PARTIAL_STORE_M0 != 0 && y == 0;
+
+    TILE(uint, M0, 1, dst_indirect_y);
+    INITIALIZE_INDIRECT_Y(M0, PARTIAL_STORE_M0, y_cond, dst_indirect_y);
+
+    const uint x_rhs = x / N0;
+
+    lhs_offset_first_element_in_bytes += y * (uint)lhs_stride_y + z * (uint)lhs_stride_y * M;
+    rhs_offset_first_element_in_bytes += (x_rhs % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x_rhs / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_offset_first_element_in_bytes += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+    rhs_offset_first_element_in_bytes += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Initialize the accumulators
+    TILE(DATA_TYPE, M0, N0, c);
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c[i].v = 0;
+    })
+
+    int i = 0;
+    for(; i <= (K - K0); i += K0)
+    {
+        TILE(DATA_TYPE, M0, K0, a);
+        TILE(DATA_TYPE, N0, K0, b);
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, RHS_STEP_X * sizeof(DATA_TYPE), b);
+
+        // Compute the matrix multiplication between the two tiles
+        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, T, a, b, c);
+
+        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
+        rhs_offset_first_element_in_bytes += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
+    }
+#if defined(RUN_LEFTOVER_K0)
+    for(; i < K; ++i)
+    {
+        TILE(DATA_TYPE, M0, 1, a);
+        TILE(DATA_TYPE, N0, 1, b);
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
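+        // One K element is processed per leftover iteration: the M0x1 LHS column loaded
+        // above is paired with one element from each of the N0 rows of the transposed RHS
+        // block, loaded below with a row stride of RHS_STEP_X elements.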
+        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, RHS_STEP_X * sizeof(DATA_TYPE), b);
+
+        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
+
+        lhs_offset_first_element_in_bytes += sizeof(DATA_TYPE);
+        rhs_offset_first_element_in_bytes += sizeof(DATA_TYPE);
+    }
+#endif // defined(RUN_LEFTOVER_K0)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    T_SCALE_CONSTANT(DATA_TYPE, M0, N0, c, (DATA_TYPE)ALPHA, c);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    TILE(DATA_TYPE, 1, N0, bias0);
+
+    T_LOAD_WIDTH_SELECT(DATA_TYPE, 1, N0, PARTIAL_STORE_N0, BUFFER, bia, x, 0, 0, x_cond, bias0);
+
+#ifndef UNIT_BETA
+    T_SCALE_CONSTANT(DATA_TYPE, 1, N0, bias0, (DATA_TYPE)BETA, bias0);
+#endif // UNIT_BETA
+
+    // c = c + bias[broadcasted]
+    T_ADD_BROADCAST_X(DATA_TYPE, M0, N0, c, bias0, c);
+#else // defined(BROADCAST_BIAS)
+    TILE(DATA_TYPE, M0, N0, bias0);
+
+    bia_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + (y * bia_stride_y) + (z * bia_stride_y * M);
+
+    T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, bia, 0, bia_stride_y, x_cond, bias0, dst_indirect_y);
+
+#ifndef UNIT_BETA
+    T_SCALE_CONSTANT(DATA_TYPE, M0, N0, bias0, (DATA_TYPE)BETA, bias0);
+#endif // UNIT_BETA
+
+    // c = c + bias
+    T_ADD(DATA_TYPE, M0, N0, c, bias0, c);
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(POST_OP1)
+    T_ACTIVATION(DATA_TYPE, M0, N0, P1_ACTIVATION_TYPE, P1_ACTIVATION_A_VAL, P1_ACTIVATION_B_VAL, c, c);
+#endif // defined(POST_OP1)
+
+#if defined(POST_OP2)
+    TILE(DATA_TYPE, M0, N0, extra0);
+
+    ex0_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + (y * ex0_stride_y) + (z * ex0_stride_y * M);
+
+    T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, ex0, 0, ex0_stride_y, x_cond, extra0, dst_indirect_y);
+
+    T_ADD(DATA_TYPE, M0, N0, c, extra0, c);
+#endif // defined(POST_OP2)
+
+#if defined(POST_OP3)
+    T_ACTIVATION(DATA_TYPE, M0, N0, P3_ACTIVATION_TYPE, P3_ACTIVATION_A_VAL, P3_ACTIVATION_B_VAL, c, c);
+#endif // defined(POST_OP3)
+
+    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + (y * dst_stride_y) + (z * dst_stride_y * M);
+
+    // Store the tile in reverse order so that the invalid values are overwritten with the valid ones
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, c, dst_indirect_y);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_T)
+
+#if defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
+ * Post op 1: activation (optional)
+ * Post op 2: elementwise op
+ * Post op 3: activation (optional)
+ *
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref ClGemmReshapeRhsMatrixKernel and the block K0xN0 is transposed
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *  - H0 >= 1
+ *
+ * @note In case of post ops, the following information must be passed at compile time:
+ * @note -DPOST_OP1, -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
+ * @note -DPOST_OP2: The arithmetic addition post op to perform at slot 2
+ * @note -DPOST_OP3, -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
+ *
+ * @param[in] lhs_ptr Pointer to the LHS tensor. Supported data types: F16/F32
+ * @param[in] lhs_stride_y Stride of the LHS tensor in Y dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the LHS tensor in Z dimension (in bytes)
+ * @param[in] lhs_w The size of the width dimension of the LHS tensor
+ * @param[in] lhs_h The size of the height dimension of the LHS tensor
+ * @param[in] lhs_n The size of the depth dimension of the LHS tensor
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS tensor
+ * @param[in] rhs_ptr Pointer to the RHS reshaped tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the RHS tensor in Y dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS tensor in Z dimension (in bytes)
+ * @param[in] rhs_w The size of the width dimension of the RHS tensor
+ * @param[in] rhs_h The size of the height dimension of the RHS tensor
+ * @param[in] rhs_n The size of the depth dimension of the RHS tensor
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS tensor
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bia_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bia_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bia_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bia_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bia_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[in] ex0_ptr (Optional) Pointer to the tensor added with POST_OP2. Supported data type: same as @p lhs_ptr
+ * @param[in] ex0_stride_y (Optional) Stride of the tensor added with POST_OP2 in Y dimension (in bytes)
+ * @param[in] ex0_stride_z (Optional) Stride of the tensor added with POST_OP2 in Z dimension (in bytes)
+ * @param[in] ex0_w (Optional) The size of the width dimension of the tensor added with POST_OP2
+ * @param[in] ex0_h (Optional) The size of the height dimension of the tensor added with POST_OP2
+ * @param[in] ex0_n (Optional) The size of the depth dimension of the tensor added with POST_OP2
+ * @param[in] ex0_offset_first_element_in_bytes (Optional) The offset of the first element in the tensor added with POST_OP2
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] M Number of rows in LHS matrix not reshaped
+ * @param[in] N Number of columns in RHS matrix not reshaped
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped
+ */
+//! @endcond
+__kernel void gemm_mm_reshaped_only_rhs_t_texture(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, IMAGE),
+#if defined(BETA)
+    TENSOR3D_T(bia, BUFFER),
+#endif // defined(BETA)
+#if defined(POST_OP2)
+    TENSOR3D_T(ex0, BUFFER),
+#endif // defined(POST_OP2)
+    TENSOR3D_T(dst, BUFFER),
+    const int M,
+    const int N,
+    const int K
+)
+{
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+    const uint x = GET_SPATIAL_IDX(0, N0, 0);
+    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+    const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x >= N) || (y >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    bool x_cond = PARTIAL_STORE_N0 != 0 && ((x + N0) > N);
+    bool y_cond = PARTIAL_STORE_M0 != 0 && y == 0;
+
+    TILE(uint, M0, 1, dst_indirect_y);
+    INITIALIZE_INDIRECT_Y(M0, PARTIAL_STORE_M0, y_cond, dst_indirect_y);
+
+    lhs_offset_first_element_in_bytes += y * (uint)lhs_stride_y + z * lhs_stride_y * M;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    const uint z_rhs = z % MATRIX_B_DEPTH;
+#else // defined(MATRIX_B_DEPTH)
+    const uint z_rhs = z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    uint x_rhs = ((x / N0) % H0) * (uint)RHS_OFFSET_X;
+    const uint y_rhs = ((x / N0) / H0) + z_rhs * rhs_h;
+
+    // Initialize the accumulators
+    TILE(DATA_TYPE, M0, N0, c);
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c[i].v = 0;
+    })
+
+    TILE(DATA_TYPE, M0, K0, a);
+    TILE(DATA_TYPE, N0, K0, b);
+
+    int i = 0;
+    for(; i <= (K - K0); i += K0)
+    {
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
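+        // The reshaped RHS is read from a cl_image here: row _i of the N0xK0 tile is
+        // fetched at an X offset of _i * RHS_STEP_X, which T_LOAD_DILATED expresses with
+        // XI_MULTIPLIER = RHS_STEP_X and YI_MULTIPLIER = 0 (all rows on the same image line).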
+        T_LOAD_DILATED(DATA_TYPE, N0, K0, IMAGE, rhs, x_rhs, y_rhs, RHS_STEP_X, 0, 1, b);
+
+        // Compute the matrix multiplication between the two tiles
+        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, T, a, b, c);
+
+        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
+        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
+    }
+#if defined(RUN_LEFTOVER_K0)
+    T_LOAD_DILATED(DATA_TYPE, N0, K0, IMAGE, rhs, x_rhs, y_rhs, RHS_STEP_X, 0, 1, b);
+
+    LOOP_UNROLLING(int, k0, 0, 1, PARTIAL_K,
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            DATA_TYPE a0 = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset_first_element_in_bytes + m0 * lhs_stride_y);
+            LOOP_UNROLLING(int, n0, 0, 1, N0,
+            {
+                c[m0].s[n0] += a0 * b[n0].s[k0];
+            })
+        })
+        lhs_offset_first_element_in_bytes += sizeof(DATA_TYPE);
+    })
+#endif // defined(RUN_LEFTOVER_K0)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    T_SCALE_CONSTANT(DATA_TYPE, M0, N0, c, (DATA_TYPE)ALPHA, c);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    TILE(DATA_TYPE, 1, N0, bias0);
+
+    T_LOAD_WIDTH_SELECT(DATA_TYPE, 1, N0, PARTIAL_STORE_N0, BUFFER, bia, x, 0, 0, x_cond, bias0);
+
+#ifndef UNIT_BETA
+    T_SCALE_CONSTANT(DATA_TYPE, 1, N0, bias0, (DATA_TYPE)BETA, bias0);
+#endif // UNIT_BETA
+
+    // c = c + bias[broadcasted]
+    T_ADD_BROADCAST_X(DATA_TYPE, M0, N0, c, bias0, c);
+#else // defined(BROADCAST_BIAS)
+    TILE(DATA_TYPE, M0, N0, bias0);
+
+    bia_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + (y * bia_stride_y) + (z * bia_stride_y * M);
+
+    T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, bia, 0, bia_stride_y, x_cond, bias0, dst_indirect_y);
+
+#ifndef UNIT_BETA
+    T_SCALE_CONSTANT(DATA_TYPE, M0, N0, bias0, (DATA_TYPE)BETA, bias0);
+#endif // UNIT_BETA
+
+    // c = c + bias
+    T_ADD(DATA_TYPE, M0, N0, c, bias0, c);
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(POST_OP1)
+    T_ACTIVATION(DATA_TYPE, M0, N0, P1_ACTIVATION_TYPE, P1_ACTIVATION_A_VAL, P1_ACTIVATION_B_VAL, c, c);
+#endif // defined(POST_OP1)
+
+#if defined(POST_OP2)
+    TILE(DATA_TYPE, M0, N0, extra0);
+
+    ex0_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + (y * ex0_stride_y) + (z * ex0_stride_y * M);
+
+    T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, ex0, 0, ex0_stride_y, x_cond, extra0, dst_indirect_y);
+
+    T_ADD(DATA_TYPE, M0, N0, c, extra0, c);
+#endif // defined(POST_OP2)
+
+#if defined(POST_OP3)
+    T_ACTIVATION(DATA_TYPE, M0, N0, P3_ACTIVATION_TYPE, P3_ACTIVATION_A_VAL, P3_ACTIVATION_B_VAL, c, c);
+#endif // defined(POST_OP3)
+
+    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_y * M;
+
+    // Store the tile in reverse order so that the invalid values are overwritten with the valid ones
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, c, dst_indirect_y);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE)
+
+#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
+ * Post op 1: activation (optional)
+ * Post op 2: elementwise op
+ * Post op 3: activation (optional)
+ *
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref ClGemmReshapeRhsMatrixKernel and the block K0xN0 is not transposed
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *  - H0 >= 1
+ *
+ * @note In case of post ops, the following information must be passed at compile time:
+ * @note -DPOST_OP1, -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
+ * @note -DPOST_OP2: The arithmetic addition post op to perform at slot 2
+ * @note -DPOST_OP3, -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
+ *
+ * @param[in] lhs_ptr Pointer to the LHS tensor. Supported data types: F16/F32
+ * @param[in] lhs_stride_y Stride of the LHS tensor in Y dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the LHS tensor in Z dimension (in bytes)
+ * @param[in] lhs_w The size of the width dimension of the LHS tensor
+ * @param[in] lhs_h The size of the height dimension of the LHS tensor
+ * @param[in] lhs_n The size of the depth dimension of the LHS tensor
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS tensor
+ * @param[in] rhs_ptr Pointer to the RHS reshaped tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the RHS tensor in Y dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS tensor in Z dimension (in bytes)
+ * @param[in] rhs_w The size of the width dimension of the RHS tensor
+ * @param[in] rhs_h The size of the height dimension of the RHS tensor
+ * @param[in] rhs_n The size of the depth dimension of the RHS tensor
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS tensor
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bia_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bia_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bia_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bia_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bia_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[in] ex0_ptr (Optional) Pointer to the tensor added with POST_OP2. Supported data type: same as @p lhs_ptr
+ * @param[in] ex0_stride_y (Optional) Stride of the tensor added with POST_OP2 in Y dimension (in bytes)
+ * @param[in] ex0_stride_z (Optional) Stride of the tensor added with POST_OP2 in Z dimension (in bytes)
+ * @param[in] ex0_w (Optional) The size of the width dimension of the tensor added with POST_OP2
+ * @param[in] ex0_h (Optional) The size of the height dimension of the tensor added with POST_OP2
+ * @param[in] ex0_n (Optional) The size of the depth dimension of the tensor added with POST_OP2
+ * @param[in] ex0_offset_first_element_in_bytes (Optional) The offset of the first element in the tensor added with POST_OP2
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] M Number of rows in LHS matrix not reshaped
+ * @param[in] N Number of columns in RHS matrix not reshaped
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped
+ */
+//! @endcond
+__kernel void gemm_mm_reshaped_only_rhs_nt(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, BUFFER),
+#if defined(BETA)
+    TENSOR3D_T(bia, BUFFER),
+#endif // defined(BETA)
+#if defined(POST_OP2)
+    TENSOR3D_T(ex0, BUFFER),
+#endif // defined(POST_OP2)
+    TENSOR3D_T(dst, BUFFER),
+    const int M,
+    const int N,
+    const int K
+)
+{
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (N0)
+#define RHS_STEP_X ((N0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (N0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+    const uint x = GET_SPATIAL_IDX(0, N0, 0);
+    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+    const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x >= N) || (y >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    bool x_cond = PARTIAL_STORE_N0 != 0 && ((x + N0) > N);
+    bool y_cond = PARTIAL_STORE_M0 != 0 && y == 0;
+
+    TILE(uint, M0, 1, dst_indirect_y);
+    INITIALIZE_INDIRECT_Y(M0, PARTIAL_STORE_M0, y_cond, dst_indirect_y);
+
+    const uint x_rhs = x / N0;
+
+    lhs_offset_first_element_in_bytes += y * (uint)lhs_stride_y + z * lhs_stride_y * M;
+    rhs_offset_first_element_in_bytes += (x_rhs % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x_rhs / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_offset_first_element_in_bytes += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+    rhs_offset_first_element_in_bytes += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Initialize the accumulators
+    TILE(DATA_TYPE, M0, N0, c);
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c[i].v = 0;
+    })
+
+    int i = 0;
+    for(; i <= (K - K0); i += K0)
+    {
+        TILE(DATA_TYPE, M0, K0, a);
+        TILE(DATA_TYPE, K0, N0, b);
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, RHS_STEP_X * sizeof(DATA_TYPE), b);
+
+        // Compute the matrix multiplication between the two tiles
+        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, a, b, c);
+
+        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
+        rhs_offset_first_element_in_bytes += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);
+    }
+#if defined(RUN_LEFTOVER_K0)
+    for(; i < K; ++i)
+    {
+        TILE(DATA_TYPE, M0, 1, a);
+        TILE(DATA_TYPE, 1, N0, b);
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, RHS_STEP_X * sizeof(DATA_TYPE), b);
+
+        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, a, b, c);
+
+        lhs_offset_first_element_in_bytes += sizeof(DATA_TYPE);
+        rhs_offset_first_element_in_bytes += RHS_STEP_X * sizeof(DATA_TYPE);
+    }
+#endif // defined(RUN_LEFTOVER_K0)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    T_SCALE_CONSTANT(DATA_TYPE, M0, N0, c, (DATA_TYPE)ALPHA, c);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    TILE(DATA_TYPE, 1, N0, bias0);
+
+    T_LOAD_WIDTH_SELECT(DATA_TYPE, 1, N0, PARTIAL_STORE_N0, BUFFER, bia, x, 0, 0, x_cond, bias0);
+
+#ifndef UNIT_BETA
+    T_SCALE_CONSTANT(DATA_TYPE, 1, N0, bias0, (DATA_TYPE)BETA, bias0);
+#endif // UNIT_BETA
+
+    // c = c + bias[broadcasted]
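+    // T_ADD_BROADCAST_X adds the single bias0 row to every one of the M0 accumulator rows.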
+    T_ADD_BROADCAST_X(DATA_TYPE, M0, N0, c, bias0, c);
+#else // defined(BROADCAST_BIAS)
+    TILE(DATA_TYPE, M0, N0, bias0);
+
+    bia_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + (y * bia_stride_y) + (z * bia_stride_y * M);
+
+    T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, bia, 0, bia_stride_y, x_cond, bias0, dst_indirect_y);
+
+#ifndef UNIT_BETA
+    T_SCALE_CONSTANT(DATA_TYPE, M0, N0, bias0, (DATA_TYPE)BETA, bias0);
+#endif // UNIT_BETA
+
+    // c = c + bias
+    T_ADD(DATA_TYPE, M0, N0, c, bias0, c);
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(POST_OP1)
+    T_ACTIVATION(DATA_TYPE, M0, N0, P1_ACTIVATION_TYPE, P1_ACTIVATION_A_VAL, P1_ACTIVATION_B_VAL, c, c);
+#endif // defined(POST_OP1)
+
+#if defined(POST_OP2)
+    TILE(DATA_TYPE, M0, N0, extra0);
+
+    ex0_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + (y * ex0_stride_y) + (z * ex0_stride_y * M);
+
+    T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, ex0, 0, ex0_stride_y, x_cond, extra0, dst_indirect_y);
+
+    T_ADD(DATA_TYPE, M0, N0, c, extra0, c);
+#endif // defined(POST_OP2)
+
+#if defined(POST_OP3)
+    T_ACTIVATION(DATA_TYPE, M0, N0, P3_ACTIVATION_TYPE, P3_ACTIVATION_A_VAL, P3_ACTIVATION_B_VAL, c, c);
+#endif // defined(POST_OP3)
+
+    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_y * M;
+
+    // Store the tile in reverse order so that the invalid values are overwritten with the valid ones
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, c, dst_indirect_y);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT)
+
+#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
+ * Post op 1: activation (optional)
+ * Post op 2: elementwise op
+ * Post op 3: activation (optional)
+ *
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref ClGemmReshapeRhsMatrixKernel and the block K0xN0 is not transposed
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *  - H0 >= 1
+ *
+ * @note In case of post ops, the following information must be passed at compile time:
+ * @note -DPOST_OP1, -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
+ * @note -DPOST_OP2: The arithmetic addition post op to perform at slot 2
+ * @note -DPOST_OP3, -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
+ *
+ * @param[in] lhs_ptr Pointer to the LHS tensor. Supported data types: F16/F32
+ * @param[in] lhs_stride_y Stride of the LHS tensor in Y dimension (in bytes)
+ * @param[in] lhs_stride_z Stride of the LHS tensor in Z dimension (in bytes)
+ * @param[in] lhs_w The size of the width dimension of the LHS tensor
+ * @param[in] lhs_h The size of the height dimension of the LHS tensor
+ * @param[in] lhs_n The size of the depth dimension of the LHS tensor
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS tensor
+ * @param[in] rhs_ptr Pointer to the RHS reshaped tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_y Stride of the RHS tensor in Y dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS tensor in Z dimension (in bytes)
+ * @param[in] rhs_w The size of the width dimension of the RHS tensor
+ * @param[in] rhs_h The size of the height dimension of the RHS tensor
+ * @param[in] rhs_n The size of the depth dimension of the RHS tensor
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS tensor
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] bia_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] bia_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] bia_w (Optional) The size of the width dimension of the bias tensor
+ * @param[in] bia_h (Optional) The size of the height dimension of the bias tensor
+ * @param[in] bia_n (Optional) The size of the depth dimension of the bias tensor
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[in] ex0_ptr (Optional) Pointer to the tensor added with POST_OP2. Supported data type: same as @p lhs_ptr
+ * @param[in] ex0_stride_y (Optional) Stride of the tensor added with POST_OP2 in Y dimension (in bytes)
+ * @param[in] ex0_stride_z (Optional) Stride of the tensor added with POST_OP2 in Z dimension (in bytes)
+ * @param[in] ex0_w (Optional) The size of the width dimension of the tensor added with POST_OP2
+ * @param[in] ex0_h (Optional) The size of the height dimension of the tensor added with POST_OP2
+ * @param[in] ex0_n (Optional) The size of the depth dimension of the tensor added with POST_OP2
+ * @param[in] ex0_offset_first_element_in_bytes (Optional) The offset of the first element in the tensor added with POST_OP2
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] M Number of rows in LHS matrix not reshaped
+ * @param[in] N Number of columns in RHS matrix not reshaped
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped
+ */
+//! @endcond
+__kernel void gemm_mm_reshaped_only_rhs_nt_texture(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, IMAGE),
+#if defined(BETA)
+    TENSOR3D_T(bia, BUFFER),
+#endif // defined(BETA)
+#if defined(POST_OP2)
+    TENSOR3D_T(ex0, BUFFER),
+#endif // defined(POST_OP2)
+    TENSOR3D_T(dst, BUFFER),
+    const int M,
+    const int N,
+    const int K
+)
+{
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (N0)
+#define RHS_STEP_X ((N0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (N0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+    const uint x = GET_SPATIAL_IDX(0, N0, 0);
+    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+    const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x >= N) || (y >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    bool x_cond = PARTIAL_STORE_N0 != 0 && ((x + N0) > N);
+    bool y_cond = PARTIAL_STORE_M0 != 0 && y == 0;
+
+    TILE(uint, M0, 1, dst_indirect_y);
+    INITIALIZE_INDIRECT_Y(M0, PARTIAL_STORE_M0, y_cond, dst_indirect_y);
+
+    lhs_offset_first_element_in_bytes += y * (uint)lhs_stride_y + z * lhs_stride_y * M;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    const uint z_rhs = z % MATRIX_B_DEPTH;
+#else // defined(MATRIX_B_DEPTH)
+    const uint z_rhs = z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    uint x_rhs = ((x / N0) % H0) * (uint)RHS_OFFSET_X;
+    const uint y_rhs = ((x / N0) / H0) + z_rhs * rhs_h;
+
+    // Initialize the accumulators
+    TILE(DATA_TYPE, M0, N0, c);
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c[i].v = 0;
+    })
+
+    int i = 0;
+    for(; i <= (K - K0); i += K0)
+    {
+        TILE(DATA_TYPE, M0, K0, a);
+        TILE(DATA_TYPE, K0, N0, b);
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD_DILATED(DATA_TYPE, K0, N0, IMAGE, rhs, x_rhs, y_rhs, RHS_STEP_X, 0, 1, b);
+
+        // Compute the matrix multiplication between the two tiles
+        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, a, b, c);
+
+        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
+        x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP;
+    }
+
+#if defined(RUN_LEFTOVER_K0)
+    for(; i < K; ++i)
+    {
+        TILE(DATA_TYPE, M0, 1, a);
+        TILE(DATA_TYPE, 1, N0, b);
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD_DILATED(DATA_TYPE, 1, N0, IMAGE, rhs, x_rhs, y_rhs, RHS_STEP_X, 0, 1, b);
+
+        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, a, b, c);
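+        // Advance to the next leftover K element: one element along the LHS row and one
+        // N0-wide row further (RHS_STEP_X along X) in the reshaped RHS image.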
+        lhs_offset_first_element_in_bytes += sizeof(DATA_TYPE);
+        x_rhs += RHS_STEP_X;
+    }
+#endif // defined(RUN_LEFTOVER_K0)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    T_SCALE_CONSTANT(DATA_TYPE, M0, N0, c, (DATA_TYPE)ALPHA, c);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    TILE(DATA_TYPE, 1, N0, bias0);
+
+    T_LOAD_WIDTH_SELECT(DATA_TYPE, 1, N0, PARTIAL_STORE_N0, BUFFER, bia, x, 0, 0, x_cond, bias0);
+
+#ifndef UNIT_BETA
+    T_SCALE_CONSTANT(DATA_TYPE, 1, N0, bias0, (DATA_TYPE)BETA, bias0);
+#endif // UNIT_BETA
+
+    // c = c + bias[broadcasted]
+    T_ADD_BROADCAST_X(DATA_TYPE, M0, N0, c, bias0, c);
+#else // defined(BROADCAST_BIAS)
+    TILE(DATA_TYPE, M0, N0, bias0);
+
+    bia_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + (y * bia_stride_y) + (z * bia_stride_y * M);
+
+    T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, bia, 0, bia_stride_y, x_cond, bias0, dst_indirect_y);
+
+#ifndef UNIT_BETA
+    T_SCALE_CONSTANT(DATA_TYPE, M0, N0, bias0, (DATA_TYPE)BETA, bias0);
+#endif // UNIT_BETA
+
+    // c = c + bias
+    T_ADD(DATA_TYPE, M0, N0, c, bias0, c);
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(POST_OP1)
+    T_ACTIVATION(DATA_TYPE, M0, N0, P1_ACTIVATION_TYPE, P1_ACTIVATION_A_VAL, P1_ACTIVATION_B_VAL, c, c);
+#endif // defined(POST_OP1)
+
+#if defined(POST_OP2)
+    TILE(DATA_TYPE, M0, N0, extra0);
+
+    ex0_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + (y * ex0_stride_y) + (z * ex0_stride_y * M);
+
+    T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, ex0, 0, ex0_stride_y, x_cond, extra0, dst_indirect_y);
+
+    T_ADD(DATA_TYPE, M0, N0, c, extra0, c);
+#endif // defined(POST_OP2)
+
+#if defined(POST_OP3)
+    T_ACTIVATION(DATA_TYPE, M0, N0, P3_ACTIVATION_TYPE, P3_ACTIVATION_A_VAL, P3_ACTIVATION_B_VAL, c, c);
+#endif // defined(POST_OP3)
+
+    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_y * M;
+
+    // Store the tile in reverse order so that the invalid values are overwritten with the valid ones
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, c, dst_indirect_y);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h
index 5706248e98..6a77463325 100644
--- a/src/core/CL/cl_kernels/tile_helpers.h
+++ b/src/core/CL/cl_kernels/tile_helpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -475,6 +475,106 @@
         }) \
     })
+
+/** Load a tile from global memory (tensor) and conditionally use a different length for the load
+ *
+ * @note If WIDTH1_CONDITION is true, the load will use the WIDTH1 length
+ * @note The vectors are stored in reverse order so the invalid rows are overwritten by the valid ones
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] HEIGHT Number of dst rows
+ * @param[in] WIDTH0 Load width to use if WIDTH1_CONDITION = false
+ * @param[in] WIDTH1 Load width to use if WIDTH1_CONDITION = true
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
+ * In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] X Starting X position
+ * @param[in] Y Starting Y position
+ * @param[in] STRIDE_Y Stride Y (in bytes) used to load each row.
+ * @param[in] WIDTH1_CONDITION Condition to select the WIDTH1 load
+ * @param[out] dst Output tile
+ */
+#define T_LOAD_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, WIDTH1_CONDITION, dst) \
+    ({ \
+        if(WIDTH1_CONDITION) \
+        { \
+            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
+            { \
+                VLOAD_PARTIAL(WIDTH0, WIDTH1) \
+                (dst[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * STRIDE_Y)); \
+            }) \
+        } \
+        else \
+        { \
+            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
+            { \
+                dst[HEIGHT - 1 - _i].v = V_LOAD(DATA_TYPE, WIDTH0, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y); \
+            }) \
+        } \
+    })
+
+/** Load a tile from global memory (tensor) using an indirect Y index tile and conditionally use a different length for the load
+ *
+ * @note If WIDTH1_CONDITION is true, the load will use the WIDTH1 length
+ * @note The vectors are stored in reverse order so the invalid rows are overwritten by the valid ones
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] HEIGHT Number of dst rows
+ * @param[in] WIDTH0 Load width to use if WIDTH1_CONDITION = false
+ * @param[in] WIDTH1 Load width to use if WIDTH1_CONDITION = true
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
+ * In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] X Starting X position
+ * @param[in] STRIDE_Y Stride Y (in bytes) used to load each row.
+ * @param[in] WIDTH1_CONDITION Condition to select the WIDTH1 load
+ * @param[out] dst Output tile
+ * @param[in] indirect_y Indirect Y index tile
+ */
+#define T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, dst, indirect_y) \
+    ({ \
+        if(WIDTH1_CONDITION) \
+        { \
+            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
+            { \
+                VLOAD_PARTIAL(WIDTH0, WIDTH1) \
+                (dst[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
+            }) \
+        } \
+        else \
+        { \
+            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
+            { \
+                dst[HEIGHT - 1 - _i].v = V_LOAD(DATA_TYPE, WIDTH0, TENSOR_TYPE, TENSOR, X, (indirect_y[HEIGHT - 1 - _i].v), STRIDE_Y); \
+            }) \
+        } \
+    })
+
+/** Load a tile from global memory (tensor) with dilation for the X and Y direction
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] HEIGHT Number of dst rows
+ * @param[in] WIDTH Number of dst columns
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
+ * In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] X Starting X position
+ * @param[in] Y Starting Y position
+ * @param[in] XI_MULTIPLIER Dilation for the X increment
+ * @param[in] YI_MULTIPLIER Dilation for the Y increment
+ * @param[in] STRIDE_Y Stride Y (in bytes) used to load each row.
+ * @param[out] dst Output tile
+ */
+#define T_LOAD_DILATED(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, XI_MULTIPLIER, YI_MULTIPLIER, STRIDE_Y, dst) \
+    ({ \
+        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
+        { \
+            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, ((X) + _i * (int)(XI_MULTIPLIER)), ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \
+        }) \
+    })
+
 /** Load a tile from global memory (tensor) using an indirect Y index tile
  *
  * @param[in] DATA_TYPE Data type
@@ -986,6 +1086,25 @@
         }) \
     })
+
+/** Element-wise addition between two tiles
+ *
+ * @note Performs: LHS + RHS = DST
+ *
+ * @param[in] DATA_TYPE LHS/RHS/DST data type
+ * @param[in] M0 Number of LHS rows
+ * @param[in] N0 Number of LHS columns
+ * @param[in] lhs LHS tile
+ * @param[in] rhs RHS tile
+ * @param[out] dst DST tile
+ */
+#define T_ADD(DATA_TYPE, M0, N0, lhs, rhs, dst) \
+    ({ \
+        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+        { \
+            dst[_m0].v = lhs[_m0].v + rhs[_m0].v; \
+        }) \
+    })
+
 /** Element-wise addition with a constant value
  *
  * @note Performs: LHS + constant = DST
  *
@@ -1001,10 +1120,26 @@
     ({ \
         LOOP_UNROLLING(int, _m0, 0, 1, M0, \
         { \
-            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
-            { \
-                dst[_m0].s[_n0] = lhs[_m0].s[_n0] + rhs_constant; \
-            }) \
+            dst[_m0].v = lhs[_m0].v + (DATA_TYPE)rhs_constant; \
+        }) \
+    })
+
+/** Element-wise scale with a constant value
+ *
+ * @note Performs: LHS * constant = DST
+ *
+ * @param[in] DATA_TYPE LHS/RHS/DST data type
+ * @param[in] M0 Number of LHS rows
+ * @param[in] N0 Number of LHS columns
+ * @param[in] lhs LHS tile
+ * @param[in] rhs_constant Constant value
+ * @param[out] dst DST tile
+ */
+#define T_SCALE_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
+    ({ \
+        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
+        { \
+            dst[_m0].v = lhs[_m0].v * (DATA_TYPE)rhs_constant; \
         }) \
     })
@@ -1066,6 +1201,26 @@
         }) \
     }
+#define T_MMUL_NT_NT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_NT_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_NT_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_NT_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_NT_half_half_half(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_NT_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_NT_char_char_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_NT_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_NT_uchar_uchar_uint(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_NT_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_NT_uchar_uchar_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_NT_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_NT_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
+    { \
+        LOOP_UNROLLING(int, _m, 0, 1, M0, \
+        { \
+            LOOP_UNROLLING(int, _n, 0, 1, N0, \
+            { \
+                LOOP_UNROLLING(int, _k, 0, 1, K0, \
+                { \
+                    dst[_m].s[_n] = fma((lhs[_m].s[_k]), (rhs[_k].s[_n]), dst[_m].s[_n]); \
+                }) \
+            }) \
+        }) \
+    }
+
+#define T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
+    ({ \
LOOP_UNROLLING(int, _m, 0, 1, M0, \ @@ -1076,3 +1231,26 @@ }) \ }) \ }) + +/** Initialize indirect Y for avoiding out-of-bound reads/writes + * + * @param[in] M0 Tile height to use if CONDITION = false + * @param[in] M1 Tile height to use if CONDITION = true + * @param[in] COND Condition to select the M1 tile height + * @param[out] indirect_y Indirect tile + */ +#define INITIALIZE_INDIRECT_Y(M0, M1, COND, indirect_y) \ + if(COND) \ + { \ + LOOP_UNROLLING(int, _i, 0, 1, M0, \ + { \ + indirect_y[_i].v = min(_i, (int)M1 - 1); \ + }) \ + } \ + else \ + { \ + LOOP_UNROLLING(int, _i, 0, 1, M0, \ + { \ + indirect_y[_i].v = _i; \ + }) \ + } diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp index 92a9d9c25a..856d37766a 100644 --- a/src/gpu/cl/ClKernelLibrary.cpp +++ b/src/gpu/cl/ClKernelLibrary.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -281,14 +281,10 @@ const std::map ClKernelLibrary::_kernel_program_map = { "gemm_mm_reshaped_lhs_nt_rhs_t_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" }, { "gemm_mm_reshaped_lhs_t_rhs_nt_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" }, { "gemm_mm_reshaped_lhs_t_rhs_nt_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl" }, - { "gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_t", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_nt_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" }, - { "gemm_mm_reshaped_only_rhs_nt_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" }, - { "gemm_mm_reshaped_only_rhs_t_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" }, - { "gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" }, + { "gemm_mm_reshaped_only_rhs_nt", "common/gemm_reshaped_rhs_only.cl" }, + { "gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm_reshaped_rhs_only.cl" }, + { "gemm_mm_reshaped_only_rhs_t", "common/gemm_reshaped_rhs_only.cl" }, + { "gemm_mm_reshaped_only_rhs_t_texture", "common/gemm_reshaped_rhs_only.cl" }, { "gemm_lc_vm_f32", "common/gemm.cl" }, { "gemm_reshape_lhs_matrix_nt", "common/gemm_utils.cl" }, { "gemm_reshape_lhs_matrix_t", "common/gemm_utils.cl" }, @@ -588,6 +584,10 @@ const std::map ClKernelLibrary::_program_source_map = { "common/gemm_utils.cl", #include "./cl_kernels/common/gemm_utils.clembed" + }, + { + "common/gemm_reshaped_rhs_only.cl", +#include "./cl_kernels/common/gemm_reshaped_rhs_only.clembed" }, { "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl", @@ -596,10 +596,6 @@ const std::map ClKernelLibrary::_program_source_map = { "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl", #include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.clembed" - }, - { - "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl", -#include 
"./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.clembed" }, { "common/gemmlowp.cl", diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp index 29f9180bf4..546a61e264 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -83,6 +83,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons && (!gemm_info.broadcast_bias), "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.has_pad_y, "Tensors cannot have padding along the Y direction"); ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.is_post_op_sequence_supported(gemm_info.post_ops), "The sequence of Post Ops is not supported"); @@ -142,17 +143,8 @@ Window validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITens ARM_COMPUTE_UNUSED(src0, src1, src2); unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - // This approach should only be used when the input/dst tensors have pad on the y direction - if((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y) - { - reinterpret_output_as_3d = false; - } - TensorInfo tmp_info(*dst); if(reinterpret_output_as_3d) @@ -200,19 +192,10 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); _add_bias = src2 != nullptr; _export_to_cl_image = rhs_info.export_to_cl_image; - _has_pad_y = gemm_info.has_pad_y; _num_post_op_args = gemm_info.post_ops.total_num_arguments(); auto padding_info = get_padding_info({ src0, src1, src2, dst }); - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y) - { - _reinterpret_input_as_3d = false; - _reinterpret_output_as_3d = false; - } - // Check if we need to slide the matrix B const unsigned int num_dimensions_src0 = src0->num_dimensions(); _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); @@ -229,10 +212,6 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1); - // These variables are used only if gemm_info.has_pad_y == true - const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? 
dst->dimension(1) : src0->dimension(1); - const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(2) : src0->dimension(2); - // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. // NOTE: This might have implications on heuristics and performance const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); @@ -243,31 +222,26 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext _m = internal_m; _n = gemm_info.n; _k = gemm_info.k; + // Create build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); - build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); - build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); - build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); - build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); - build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); - build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT"); - build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1))); build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - if(_has_pad_y) - { - build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); - } + build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); + build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); + build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); + build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); + build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); + build_opts.add_option_if(gemm_info.k % rhs_info.k0, "-DRUN_LEFTOVER_K0"); + build_opts.add_option_if((gemm_info.k % rhs_info.k0) && rhs_info.transpose, "-DPARTIAL_K=" + support::cpp11::to_string(gemm_info.k % rhs_info.k0)); + // If post_ops are used, then we disable the use of gemm_info.activation_info if(gemm_info.post_ops.size() > 0) { @@ -275,15 +249,15 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext } else { - 
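/* Why RUN_LEFTOVER_K0 and PARTIAL_K above are defined only when K is not a
 * multiple of K0: the kernel consumes K in full K0-wide steps and needs one
 * extra, guarded pass for the remainder. A host-side C model of the loop
 * split (the step bodies are placeholders, not the kernel code):
 */
static void k_loop_shape(int k_total, int k0)
{
    int k = 0;
    for(; k <= k_total - k0; k += k0)
    {
        /* full K0-wide accumulation step, always compiled */
    }
#if defined(RUN_LEFTOVER_K0)
    for(; k < k_total; ++k)
    {
        /* remainder handling; for the transposed-RHS variant the remainder
         * size is additionally known at compile time as PARTIAL_K = K % K0 */
    }
#endif /* RUN_LEFTOVER_K0 */
}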
build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DPOST_OP1"); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DP1_ACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DP1_ACTIVATION_A_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DP1_ACTIVATION_B_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); } std::string kernel_name("gemm_mm_reshaped_only_rhs_"); kernel_name += rhs_info.transpose ? "t" : "nt"; - kernel_name += rhs_info.export_to_cl_image ? "_texture" : ""; - post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops); + kernel_name += _export_to_cl_image ? "_texture" : ""; // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); @@ -294,11 +268,8 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext // Set config_id for enabling LWS tuning _config_id = kernel_name; _config_id += "_"; - _config_id += (_has_pad_y ? "" : "no_pad_y_"); _config_id += (_add_bias ? "add_bias_" : ""); _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : ""); - _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); - _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : ""); _config_id += lower_string(string_from_data_type(src0->data_type())); _config_id += "_"; @@ -350,24 +321,12 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); } - const size_t lhs_idx_batch_size = _reinterpret_input_as_3d && !_has_pad_y ? 3u : 2u; - const size_t rhs_idx_batch_size = 2u; - const size_t bia_idx_batch_size = 2u; - const size_t out_idx_batch_size = _reinterpret_output_as_3d && !_has_pad_y ?
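/* With the slot-numbered defines above, a fused activation is expressed as
 * post-op slot 1 rather than through the old unprefixed ACTIVATION_TYPE /
 * A_VAL / B_VAL macros. A hedged sketch of the kernel-side consumption,
 * assuming the T_ACTIVATION tile macro from tile_helpers.h and an
 * accumulator tile named c (a fragment, valid only inside the kernel):
 */
#if defined(POST_OP1)
    T_ACTIVATION(DATA_TYPE, M0, N0, P1_ACTIVATION_TYPE, P1_ACTIVATION_A_VAL, P1_ACTIVATION_B_VAL, c, c);
#endif /* POST_OP1 */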
3u : 2u; - Window slice = window.first_slice_window_3D(); Window slice_matrix_b = slice; slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - // Get cross plane pads - const unsigned int total_cross_plane_pad_lhs = src0->info()->padding().top + src0->info()->padding().bottom; - const unsigned int total_cross_plane_pad_out = dst->info()->padding().top + dst->info()->padding().bottom; - - // The execution should fail if we try to run with has_pad_y = false but we have padding in either the LHS or DST tensor - ARM_COMPUTE_ERROR_ON(!_has_pad_y && ((total_cross_plane_pad_lhs != 0) || (total_cross_plane_pad_out != 0))); - cl::Image2D src1_image2d; if(_export_to_cl_image) @@ -391,63 +350,30 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con unsigned int idx = 0; // LHS buffer - add_2D_tensor_argument(idx, src0, slice); + add_3d_tensor_nhw_argument(idx, src0); // RHS buffer or RHS OpenCL image (_export_to_cl_image == true) if(_export_to_cl_image) { _kernel.setArg(idx++, src1_image2d); } - else - { - add_2D_tensor_argument(idx, src1, slice_b); - } + add_3d_tensor_nhw_argument(idx, src1); // Bias buffer (_add_bias == true) - add_2D_tensor_argument_if(_add_bias, idx, src2, slice); - - // dst buffer - add_2D_tensor_argument(idx, dst, slice); - - // post op argument buffers - for(size_t i = 0; i < _num_post_op_args; ++i) - { - const auto post_op_arg = utils::cast::polymorphic_downcast(tensors.get_const_tensor(experimental::get_post_op_arg_type(i))); - add_2D_tensor_argument(idx, post_op_arg, slice); - } - - // LHS stride_z - _kernel.setArg(idx++, static_cast(src0->info()->strides_in_bytes()[lhs_idx_batch_size])); - - // RHS stride_z (not used if _export_to_cl_image == true) - _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[rhs_idx_batch_size])); - - // Bias stride_z (if _add_bias == true) if(_add_bias) { - _kernel.setArg(idx++, static_cast(src2->info()->strides_in_bytes()[bia_idx_batch_size])); + add_3d_tensor_nhw_argument(idx, src2); } - // dst stride_z - _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[out_idx_batch_size])); - // post op argument stride_z + // post op argument buffers for(size_t i = 0; i < _num_post_op_args; ++i) { const auto post_op_arg = utils::cast::polymorphic_downcast(tensors.get_const_tensor(experimental::get_post_op_arg_type(i))); - _kernel.setArg(idx++, static_cast(post_op_arg->info()->strides_in_bytes()[2])); + add_3d_tensor_nhw_argument(idx, post_op_arg); } - // Cross-plan padding (if _reinterpret_input_as_3d = true) - if(_reinterpret_input_as_3d && _has_pad_y) - { - _kernel.setArg(idx++, static_cast(total_cross_plane_pad_lhs)); - } - - // Cross-plan padding (if reinterpret_output_as_3d = true) - if(_reinterpret_output_as_3d && _has_pad_y) - { - _kernel.setArg(idx++, static_cast(total_cross_plane_pad_out)); - } + // dst buffer + add_3d_tensor_nhw_argument(idx, dst); // Pass m, n and k at runtime as signed ints, to ensure the results of any subtractions they appear in remain signed. _kernel.setArg(idx++, _m); diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h index ec5878d5cc..297e681895 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2022 Arm Limited.
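/* The run_op() rewrite above replaces the interleaved "2D tensor arguments
 * plus trailing stride_z scalars" scheme with one self-contained 3D argument
 * block per tensor, set in a fixed order. The field layout of
 * add_3d_tensor_nhw_argument() is not shown in this patch; the order, modeled
 * below in plain C, is. Note that the RHS buffer is now always bound, even
 * when the OpenCL image path is taken.
 */
#include <stdio.h>

int main(void)
{
    const char *arg_blocks[] =
    {
        "lhs (src0)",
        "rhs image2d, only if _export_to_cl_image",
        "rhs buffer (src1), always",
        "bias (src2), only if _add_bias",
        "post-op arguments, one block each",
        "dst",
        "m, n, k as signed ints",
    };
    for(unsigned int i = 0; i < sizeof(arg_blocks) / sizeof(arg_blocks[0]); ++i)
    {
        printf("argument block %u: %s\n", i, arg_blocks[i]);
    }
    return 0;
}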
* * SPDX-License-Identifier: MIT * @@ -96,7 +96,6 @@ private: bool _use_dummy_work_items{ false }; bool _add_bias{ false }; bool _export_to_cl_image{ false }; - bool _has_pad_y{ false }; signed int _m{ 1 }; signed int _n{ 1 }; signed int _k{ 1 }; diff --git a/src/gpu/cl/operators/ClGemm.cpp b/src/gpu/cl/operators/ClGemm.cpp index 555738531a..766aec339e 100644 --- a/src/gpu/cl/operators/ClGemm.cpp +++ b/src/gpu/cl/operators/ClGemm.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -452,9 +452,6 @@ Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInf kernel_info.has_pad_y = false; ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); - kernel_info.has_pad_y = true; - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); - return Status{}; } diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp index ca63d3a679..860082f32b 100644 --- a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp +++ b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -107,6 +107,12 @@ const auto act_values = framework::dataset::make("Activation", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 10.f), }); +/** Activation values to test */ +const auto act_identity = framework::dataset::make("Activation", +{ + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::IDENTITY), +}); + /** M0 values to test - precommit */ const auto m0_values_precommit = framework::dataset::make("M0", { 4 }); @@ -158,8 +164,8 @@ const auto boundary_handling_cases = combine(combine(combine(combine(combine(com framework::dataset::make("export_to_cl_image_rhs", {true, false})), // Only need to test F32 as F16 shares identical boundary handling logics framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("alpha", -0.75f )), - framework::dataset::make("beta", -0.35f )), + framework::dataset::make("alpha", 1.0f )), + framework::dataset::make("beta", 0.0f )), broadcast_bias_values), framework::dataset::make("Activation", ActivationLayerInfo())); @@ -170,7 +176,7 @@ experimental::PostOpList post_ops_1() experimental::PostOpList post_ops{}; post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F}); post_ops.push_back_op>( - std::make_tuple(true, true, false), // If broadcast in dims 0, 1 and 2 + std::make_tuple(false, false, false), 0, ConvertPolicy::SATURATE); post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); @@ -180,7 +186,7 @@ experimental::PostOpList post_ops_2() { experimental::PostOpList post_ops{}; post_ops.push_back_op>( - std::make_tuple(false, true, true), // If broadcast in dims 0, 1 and 2 + std::make_tuple(false, false, false), 1, ConvertPolicy::SATURATE); post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); @@ -189,44 +195,18 @@ experimental::PostOpList post_ops_2() experimental::PostOpList post_ops_3() { experimental::PostOpList post_ops{}; - 
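/* What a fused chain like post_ops_1() above computes per output element,
 * written as scalar C for clarity (x is the element-wise addend; note that
 * RELU ignores its a/b parameters):
 */
#include <math.h>

static float post_ops_1_scalar(float gemm_out, float x)
{
    float v = 0.5f * gemm_out + 0.0f; /* LINEAR activation, a = 0.5, b = 0 */
    v += x;                           /* element-wise add, no broadcasting */
    return fmaxf(v, 0.0f);            /* RELU                              */
}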
post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); post_ops.push_back_op>( - std::make_tuple(false, false, true), // If broadcast in dims 0, 1 and 2 + std::make_tuple(false, false, false), 1, ConvertPolicy::SATURATE); return post_ops; } -// To test that the output of the main op is the first parameter in prelu post op -experimental::PostOpList post_ops_4() -{ - experimental::PostOpList post_ops{}; - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F}); - post_ops.push_back_op>( - std::make_tuple(false, false, true), // If true, broadcast in corresponding dim: 0, 1 or 2 - 0, - ConvertPolicy::SATURATE); - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); - return post_ops; -} -// To test that the output of the main op is the second parameter in prelu post op i.e. it is the alpha_param -experimental::PostOpList post_ops_5() -{ - experimental::PostOpList post_ops{}; - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F}); - post_ops.push_back_op>( - std::make_tuple(false, false, false), // If true, broadcast in corresponding dim: 0, 1 or 2 - 1, - ConvertPolicy::SATURATE); - post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); - return post_ops; -} + /** Different Post Op Lists */ const auto post_op_lists = framework::dataset::make("post_op_lists", { post_ops_1(), post_ops_2(), - post_ops_3(), - post_ops_4(), - post_ops_5() + post_ops_3() } ); bool is_post_op_list_valid(unsigned int m, unsigned int n, unsigned int k, unsigned int batch, DataType data_type, const experimental::PostOpList& post_ops) @@ -466,20 +446,7 @@ TEST_CASE(BroadcastInBothXandYDims, framework::DatasetMode::ALL) ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS); } -TEST_CASE(BroadcastInAllDims, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::F32; - const unsigned int m = 22; - const unsigned int n = 16; - const unsigned int k = 15; - const unsigned int batch = 3; - TensorShape post_op_arg_shape(1, 1, 1); - TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); - experimental::PostOpList post_ops{}; - post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); - ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS); -} TEST_SUITE_END() // Valid TEST_SUITE_END() // ValidateFusedPostOps TEST_SUITE(Float) @@ -633,7 +600,7 @@ FIXTURE_DATA_TEST_CASE(RunPrecommit3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixt i_values_rhs), t_values_rhs), framework::dataset::make("export_to_cl_image_rhs", {false, true})), - framework::dataset::make("has_pad_y", {false, true})), + framework::dataset::make("has_pad_y", {false})), framework::dataset::make("DataType", DataType::F32)), a_values), beta_values), @@ -665,7 +632,7 @@ FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixtur i_values_rhs), t_values_rhs), framework::dataset::make("export_to_cl_image_rhs", {false, true})), - framework::dataset::make("has_pad_y", {false, true})), + framework::dataset::make("has_pad_y", {false})), framework::dataset::make("DataType", DataType::F32)), a_values), beta_values), @@ -702,7 +669,7 @@ FIXTURE_DATA_TEST_CASE(RunPrecommit, CLGEMMMatrixMultiplyReshapedOnlyRHSWithPost a_values), beta_values), 
framework::dataset::make("broadcast_bias", { false } )), - act_values), + act_identity), post_op_lists) ) { @@ -799,7 +766,7 @@ FIXTURE_DATA_TEST_CASE(RunPrecommit3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixt i_values_rhs), t_values_rhs), framework::dataset::make("export_to_cl_image_rhs", true)), - framework::dataset::make("has_pad_y", {false, true})), + framework::dataset::make("has_pad_y", {false})), framework::dataset::make("DataType", DataType::F16)), a_values), beta_values), @@ -831,7 +798,7 @@ FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixtur i_values_rhs), t_values_rhs), framework::dataset::make("export_to_cl_image_rhs", true)), - framework::dataset::make("has_pad_y", {false, true})), + framework::dataset::make("has_pad_y", {false})), framework::dataset::make("DataType", DataType::F16)), a_values), beta_values), @@ -867,7 +834,7 @@ FIXTURE_DATA_TEST_CASE(RunPrecommit, CLGEMMMatrixMultiplyReshapedOnlyRHSWithPost a_values), beta_values), framework::dataset::make("broadcast_bias", { false } )), - act_values), + act_identity), post_op_lists) ) { diff --git a/tests/validation/fixtures/GEMMFixture.h b/tests/validation/fixtures/GEMMFixture.h index 52cd6759a7..95dcd70104 100644 --- a/tests/validation/fixtures/GEMMFixture.h +++ b/tests/validation/fixtures/GEMMFixture.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -1551,6 +1551,7 @@ public: const TensorShape bias_shape(n, broadcast_bias ? 1 : m, broadcast_bias ? 1 : batch_size); + auto post_ops_with_shapes = experimental::transform_post_op_list_arguments(post_ops, [ = ](auto broadcast) { -- cgit v1.2.1