author    SiCongLi <sicong.li@arm.com>    2021-10-24 19:12:33 +0100
committer SiCong Li <sicong.li@arm.com>   2021-11-02 10:41:11 +0000
commit    afa19725f7f3feb2c21a6aed02ade49d08e3097b (patch)
tree      d796239f542cf447060e6fa6d1240fecb3d6c7a6
parent    579ca84bd8ef5a91eded65c4dc5e0b9f7de8bef1 (diff)
download  ComputeLibrary-afa19725f7f3feb2c21a6aed02ade49d08e3097b.tar.gz
Add post ops to ClGemmMatrixMultiplyReshapedOnlyRHSKernel and ClGemmMatrixMultiplyNativeKernel Part 3
Partially resolves: COMPMID-4435

Change-Id: Ifc5affa3a24a70942ca2d001380205df09b03ad7
Signed-off-by: SiCongLi <sicong.li@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6550
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--  Android.bp                                                                                                     |    2
-rw-r--r--  SConscript                                                                                                     |    2
-rw-r--r--  src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl            |  366
-rw-r--r--  src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl          |    4
-rw-r--r--  src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl | 1375
-rw-r--r--  src/core/CL/cl_kernels/common/gemm.cl                                                                          |    5
-rw-r--r--  src/gpu/cl/ClKernelLibrary.cpp                                                                                 |   13
-rw-r--r--  src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp                                                        |   55
-rw-r--r--  src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h                                                          |   11
-rw-r--r--  src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp                                               |   49
-rw-r--r--  src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h                                                 |   15
-rw-r--r--  src/gpu/cl/operators/ClGemm.cpp                                                                                |   20
-rw-r--r--  tests/framework/Macros.h                                                                                       |   10
-rw-r--r--  tests/validation/CL/GEMMMatrixMultiplyNative.cpp                                                               |  218
-rw-r--r--  tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp                                                      |  270
-rw-r--r--  tests/validation/fixtures/GEMMFixture.h                                                                        |  444
16 files changed, 2821 insertions(+), 38 deletions(-)
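
Note: distilled from the kernel documentation in this patch, the new *_post_act_eltwise_op_act kernels compute, per output block (illustrative pseudocode only; act1/act3/eltwise stand for whatever post ops are configured at build time):

    c = alpha * (A x B);                  // base GEMM block
    c = c + beta * bias;                  // only if BETA is defined
    c = act1(c);                          // post op 1: optional activation
    c = eltwise(c, eltwise_operand);      // post op 2: elementwise op, broadcast and boundary aware
    c = act3(c);                          // post op 3: optional activation
    // store c into dst, boundary aware
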
diff --git a/Android.bp b/Android.bp
index 4ec0475605..32d5805e59 100644
--- a/Android.bp
+++ b/Android.bp
@@ -28,7 +28,9 @@ opencl_srcs = [
"src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl",
"src/core/CL/cl_kernels/common/elementwise_unary.cl",
"src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h",
+ "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl",
"src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl",
+ "src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl",
"src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_elementwise_op_helpers.h",
"src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h",
"src/core/CL/cl_kernels/common/fft.cl",
diff --git a/SConscript b/SConscript
index 468d7388cd..c5c6ca3a0c 100644
--- a/SConscript
+++ b/SConscript
@@ -310,7 +310,9 @@ if env['opencl'] and env['embed_kernels']:
'src/core/CL/cl_kernels/common/floor.cl',
'src/core/CL/cl_kernels/common/gather.cl',
'src/core/CL/cl_kernels/common/gemm.cl',
+ 'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl',
'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl',
+ 'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl',
'src/core/CL/cl_kernels/common/gemv.cl',
'src/core/CL/cl_kernels/common/gemmlowp.cl',
'src/core/CL/cl_kernels/common/generate_proposals.cl',
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl
new file mode 100644
index 0000000000..e53ce3d1b2
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl
@@ -0,0 +1,366 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "fp_post_ops_act_eltwise_op_act.h"
+#include "gemm_helpers.h"
+#include "repeat.h"
+
+/** (EXPERIMENTAL_POST_OPS) gemm_mm_native kernel */
+#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
+#if defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
+
+#define VFMA(a, b, c) \
+ ({ \
+ c = fma(a, b, c); \
+ })
+
+#if M0 == 1
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ })
+#elif M0 == 2 // M0 == 2
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ })
+#elif M0 == 3 // M0 == 3
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ })
+#elif M0 == 4 // M0 == 4
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ })
+#elif M0 == 5 // M0 == 5
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ })
+#elif M0 == 6 // M0 == 6
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ })
+#elif M0 == 7 // M0 == 7
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+ })
+#elif M0 == 8 // M0 == 8
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
+ })
+#else // M0 not supported
+#error "M0 not supported"
+#endif // M0 not supported
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
+ * Post op 1: activation (optional)
+ * Post op 2: elementwise op
+ * Post op 3: activation (optional)
+ *
+ * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
+ * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
+ * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
+ * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
+ * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
+ *
+ * All parameters are similarly defined in kernel gemm_mm_native, with these additions:
+ *
+ * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32
+ * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
+ * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
+ * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
+ */
+__kernel void gemm_mm_native_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
+ IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst),
+ // Post-Op arguments
+ IMAGE_DECLARATION(eltwise_operand),
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif //defined(BETA)
+ uint dst_stride_z,
+ uint eltwise_operand_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ )
+{
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+ // Compute RHS matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zlhs, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ int i = 0;
+#if K0 > 1
+ for(; i <= (K - K0); i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+ // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+ // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+ // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+ // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+ // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+ // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+ // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);
+
+ RHS_VFMA_M0xN0(0, a, b0, c);
+ RHS_VFMA_M0xN0(1, a, b1, c);
+#if K0 > 2
+ RHS_VFMA_M0xN0(2, a, b2, c);
+#endif // K0 > 2
+#if K0 > 3
+ RHS_VFMA_M0xN0(3, a, b3, c);
+#endif // K0 > 3
+#if K0 > 4
+ RHS_VFMA_M0xN0(4, a, b4, c);
+ RHS_VFMA_M0xN0(5, a, b5, c);
+ RHS_VFMA_M0xN0(6, a, b6, c);
+ RHS_VFMA_M0xN0(7, a, b7, c);
+#endif // K0 > 4
+#if K0 > 8
+ RHS_VFMA_M0xN0(8, a, b8, c);
+ RHS_VFMA_M0xN0(9, a, b9, c);
+ RHS_VFMA_M0xN0(A, a, bA, c);
+ RHS_VFMA_M0xN0(B, a, bB, c);
+ RHS_VFMA_M0xN0(C, a, bC, c);
+ RHS_VFMA_M0xN0(D, a, bD, c);
+ RHS_VFMA_M0xN0(E, a, bE, c);
+ RHS_VFMA_M0xN0(F, a, bF, c);
+#endif // K0 > 8
+
+ lhs_offset += K0 * sizeof(DATA_TYPE);
+ rhs_offset += K0 * rhs_stride_y;
+ }
+#endif // K0 > 1
+ // Left-over accumulations
+ for(; i < K; ++i)
+ {
+ // Load values from LHS matrix
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));
+#if M0 > 1
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));
+#endif // M0 > 1
+#if M0 > 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));
+#endif // M0 > 2
+#if M0 > 3
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));
+#endif // M0 > 3
+#if M0 > 4
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));
+#endif // M0 > 4
+#if M0 > 5
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));
+#endif // M0 > 5
+#if M0 > 6
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));
+#endif // M0 > 6
+#if M0 > 7
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));
+#endif // M0 > 7
+
+ VEC_DATA_TYPE(DATA_TYPE, N0)
+ b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));
+ RHS_VFMA_M0xN0(0, a, b, c);
+
+ lhs_offset += sizeof(DATA_TYPE);
+ rhs_offset += rhs_stride_y;
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+ // Boundary conditions: detect if current block is at the "bottom" or "right" boundary
+ const bool cond_y = ((y + 1) * M0 >= M);
+ const bool cond_x = ((x + 1) * N0 >= N);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
+ 2) * bias_stride_z;
+
+ LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+ // c = act(c)
+ POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
+ // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware)
+ POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+ // c = act(c)
+ POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
+
+ // Store output block
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+}
+#endif // defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
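
Note: the RHS_VFMA_M0xN0 helper above unrolls one rank-1 update over the M0 accumulator rows. As a sketch of what it generates (assuming, for illustration only, M0=2, N0=4 and DATA_TYPE=float), RHS_VFMA_M0xN0(0, a, b0, c) expands to:

    c0 = fma((float4)(a0.s0), b0, c0);   // row 0 of the block: c0 += a0.s0 * b0
    c1 = fma((float4)(a1.s0), b0, c1);   // row 1 of the block: c1 += a1.s0 * b0
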
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
index 9404c5e6db..758fd327fe 100644
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
+++ b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl
@@ -352,6 +352,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_post_act_eltwise_op_act(IMAGE_DECLAR
REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+ // Boundary conditions: detect if current block is at the "bottom" or "right" boundary
const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
@@ -568,6 +569,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture_post_act_eltwise_op_act(IMAG
REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+ // Boundary conditions: detect if current block is at the "bottom" or "right" boundary
const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
@@ -824,6 +826,7 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLAR
const uint y = get_global_id(1);
const uint z = get_global_id(2);
+ // Boundary conditions: detect if current block is at the "bottom" or "right" boundary
const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
@@ -1326,6 +1329,7 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture_post_act_eltwise_op_act(IMAG
REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+ // Boundary conditions: detect if current block is at the "bottom" or "right" boundary
const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
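
Note: the boundary-condition comments added above document the existing cond_y/cond_x flags, which mark work items whose output block runs past the bottom or right edge of the destination. As a worked example with illustrative values, take M = 7 and M0 = 4: the work item at get_global_id(1) = 1 gives (1 + 1) * 4 = 8 >= 7, so cond_y is true and the boundary-aware load/store macros fall back to the partial block height PARTIAL_STORE_M0 (typically M % M0 = 3 here) instead of a full M0 rows.
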
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
new file mode 100644
index 0000000000..508ee96d2d
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
@@ -0,0 +1,1375 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "fp_post_ops_act_eltwise_op_act.h"
+#include "gemm_helpers.h"
+#include "repeat.h"
+
+/** (EXPERIMENTAL_POST_OPS) gemm_mm_reshaped_only_rhs kernel */
+#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
+#if defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
+
+#define CONCAT(a, b) a##b
+
+#define ARM_DOT1(a, b, c) \
+ ({ \
+ c = fma(a, b, c); \
+ })
+#define ARM_DOT2(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ })
+#define ARM_DOT3(a, b, c) \
+ ({ \
+ ARM_DOT2(a, b, c); \
+ c = fma((a.s2), (b.s2), c); \
+ })
+#define ARM_DOT4(a, b, c) \
+ ({ \
+ ARM_DOT3(a, b, c); \
+ c = fma((a.s3), (b.s3), c); \
+ })
+#define ARM_DOT8(a, b, c) \
+ ({ \
+ ARM_DOT4((a.lo), (b.lo), c); \
+ ARM_DOT4((a.hi), (b.hi), c); \
+ })
+#define ARM_DOT16(a, b, c) \
+ ({ \
+ ARM_DOT8((a.lo), (b.lo), c); \
+ ARM_DOT8((a.hi), (b.hi), c); \
+ })
+
+#if N0 == 2
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ })
+#elif N0 == 3 // N0 == 3
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ })
+#elif N0 == 4 // N0 == 4
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##3), (c.s3)); \
+ })
+#elif N0 == 8 // N0 == 8
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##3), (c.s3)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##4), (c.s4)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##5), (c.s5)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##6), (c.s6)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##7), (c.s7)); \
+ })
+#elif N0 == 16 // N0 == 16
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##3), (c.s3)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##4), (c.s4)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##5), (c.s5)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##6), (c.s6)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##7), (c.s7)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##8), (c.s8)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##9), (c.s9)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##A), (c.sA)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##B), (c.sB)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##C), (c.sC)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##D), (c.sD)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##E), (c.sE)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##F), (c.sF)); \
+ })
+#else // N0 not supported
+#error "N0 value not supported"
+#endif // N0 conditions
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
+ * Post op 1: activation (optional)
+ * Post op 2: elementwise op
+ * Post op 3: activation (optional)
+ *
+ * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
+ * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
+ * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
+ * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
+ * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
+ *
+ * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_t, with these additions:
+ *
+ * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32
+ * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
+ * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
+ * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
+ */
+__kernel void gemm_mm_reshaped_only_rhs_t_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
+ IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst),
+ // Post-Op arguments
+ IMAGE_DECLARATION(eltwise_operand),
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif //defined(BETA)
+ uint dst_stride_z,
+ uint eltwise_operand_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ )
+{
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+ // Compute RHS reshaped matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zlhs, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ int i = 0;
+ for(; i <= (K - K0); i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+ // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+ // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+ // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+ // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+ // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+ // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+ // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS reshaped matrix
+ LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
+
+ // Accumulate
+ ARM_DOT_K0XN0(K0, a0, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(K0, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(K0, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(K0, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(K0, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(K0, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(K0, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(K0, a7, b, c7);
+#endif // M0 > 7
+
+ lhs_offset += K0 * sizeof(DATA_TYPE);
+ rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
+ }
+
+ // Left-over accumulations
+ for(; i < K; ++i)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS reshaped matrix
+ LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
+
+ // Accumulate
+ ARM_DOT_K0XN0(1, a0, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(1, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(1, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(1, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(1, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(1, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(1, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(1, a7, b, c7);
+#endif // M0 > 7
+
+ lhs_offset += sizeof(DATA_TYPE);
+ rhs_offset += sizeof(DATA_TYPE);
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+ // Boundary conditions: detect if current block is at the "bottom" or "right" boundary
+ const bool cond_y = ((y + 1) * M0 >= M);
+ const bool cond_x = ((x + 1) * N0 >= N);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * M0 * bias_stride_y) + z * bias_stride_z;
+
+ LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+ // c = act(c)
+ POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
+ // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware)
+ POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+ // c = act(c)
+ POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
+
+ // Store output block
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+
+#if defined(OPENCL_IMAGE_SUPPORT)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object.
+ * Post op 1: activation (optional)
+ * Post op 2: elementwise op
+ * Post op 3: activation (optional)
+ *
+ * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
+ * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
+ * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
+ * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
+ * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
+ *
+ * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_t_texture, with these additions:
+ *
+ * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32
+ * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
+ * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
+ * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
+ */
+__kernel void gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
+ __read_only image2d_t rhs_img,
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst),
+ // Post-Op arguments
+ IMAGE_DECLARATION(eltwise_operand),
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif //defined(BETA)
+ uint dst_stride_z,
+ uint eltwise_operand_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ )
+{
+ // Pixel unit
+#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
+
+#define LEFTOVER_K (K % K0)
+
+ // Block size
+#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (PIXEL_UNIT)
+#define RHS_STEP_X (PIXEL_UNIT * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X PIXEL_UNIT
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+ const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
+#else // defined(MATRIX_B_DEPTH)
+ const uint z_rhs = get_global_id(2);
+#endif // defined(MATRIX_B_DEPTH)
+
+ // Compute RHS matrix coordinates
+ uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
+ const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zlhs, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
+
+ int i = 0;
+ for(; i <= (K - K0); i += K0)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS matrix stored in a cl_image
+ REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
+ LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
+
+ // Accumulate
+ ARM_DOT_K0XN0(K0, a0, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(K0, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(K0, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(K0, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(K0, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(K0, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(K0, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(K0, a7, b, c7);
+#endif // M0 > 7
+
+ lhs_offset += K0 * sizeof(DATA_TYPE);
+ x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
+ }
+
+#if LEFTOVER_K != 0
+ // Note: We cannot read out-of-bound elements from the RHS matrix because
+ // the RHS width is always a multiple of K0. This is not necessarily true for the LHS matrix
+
+ union UNION_VEC_TYPE
+ {
+ DATA_TYPE s[K0];
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ v;
+ };
+
+ union UNION_VEC_TYPE a0 = {.v = 0 };
+#if M0 > 1
+ union UNION_VEC_TYPE a1 = {.v = 0 };
+#endif // M0 > 1
+#if M0 > 2
+ union UNION_VEC_TYPE a2 = {.v = 0 };
+#endif // M0 > 2
+#if M0 > 3
+ union UNION_VEC_TYPE a3 = {.v = 0 };
+#endif // M0 > 3
+#if M0 > 4
+ union UNION_VEC_TYPE a4 = {.v = 0 };
+#endif // M0 > 4
+#if M0 > 5
+ union UNION_VEC_TYPE a5 = {.v = 0 };
+#endif // M0 > 5
+#if M0 > 6
+ union UNION_VEC_TYPE a6 = {.v = 0 };
+#endif // M0 > 6
+#if M0 > 7
+ union UNION_VEC_TYPE a7 = {.v = 0 };
+#endif // M0 > 7
+
+ REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
+
+ // Load from RHS matrix
+ LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
+
+ // Load from LHS matrix
+ for(int k = 0; k < LEFTOVER_K; ++k)
+ {
+ a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
+#if M0 > 1
+ a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
+#endif // M0 > 1
+#if M0 > 2
+ a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
+#endif // M0 > 2
+#if M0 > 3
+ a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
+#endif // M0 > 3
+#if M0 > 4
+ a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
+#endif // M0 > 4
+#if M0 > 5
+ a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
+#endif // M0 > 5
+#if M0 > 6
+ a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
+#endif // M0 > 6
+#if M0 > 7
+ a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
+#endif // M0 > 7
+
+ lhs_offset += sizeof(DATA_TYPE);
+ }
+
+ // Accumulate
+ ARM_DOT_K0XN0(K0, a0.v, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(K0, a1.v, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(K0, a2.v, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(K0, a3.v, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(K0, a4.v, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(K0, a5.v, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(K0, a6.v, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(K0, a7.v, b, c7);
+#endif // M0 > 7
+
+#endif // LEFTOVER_K != 0
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout(M0-1)=0;
+
+ // Boundary conditions: detect if current block is at the "bottom" or "right" boundary
+ const bool cond_y = ((y + 1) * M0 >= M);
+ const bool cond_x = ((x + 1) * N0 >= N);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * M0 * bias_stride_y) + z * bias_stride_z;
+
+ LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+ // c = act(c)
+ POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
+ // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware)
+ POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+ // c = act(c)
+ POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
+
+ // Store output block
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef LEFTOVER_K
+#undef PIXEL_UNIT
+}
+#endif // defined(OPENCL_IMAGE_SUPPORT)
+
+#define VFMA(a, b, c) \
+ ({ \
+ c = fma(a, b, c); \
+ })
+
+#if M0 == 1
+#define VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ })
+#elif M0 == 2 // M0 == 2
+#define VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ })
+#elif M0 == 3 // M0 == 3
+#define VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ })
+#elif M0 == 4 // M0 == 4
+#define VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ })
+#elif M0 == 5 // M0 == 5
+#define VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ })
+#elif M0 == 6 // M0 == 6
+#define VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ })
+#elif M0 == 7 // M0 == 7
+#define VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+ })
+#elif M0 == 8 // M0 == 8
+#define VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
+ })
+#else // M0 not supported
+#error "M0 not supported"
+#endif // M0 not supported
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
+ * Post op 1: activation (optional)
+ * Post op 2: elementwise op
+ * Post op 3: activation (optional)
+ *
+ * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
+ * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
+ * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
+ * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
+ * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
+ *
+ * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_nt, with these additions:
+ *
+ * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32
+ * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
+ * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
+ * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
+ */
+__kernel void gemm_mm_reshaped_only_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
+ IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst),
+ // Post-Op arguments
+ IMAGE_DECLARATION(eltwise_operand),
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif //defined(BETA)
+ uint dst_stride_z,
+ uint eltwise_operand_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ )
+{
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (N0)
+#define RHS_STEP_X ((N0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (N0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+ // Compute RHS reshaped matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... zero15=0;
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+
+ // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zin, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ int i = 0;
+ for(; i <= (K - K0); i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+ // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+ // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+ // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+ // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+ // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+ // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+ // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
+
+ VEC_DATA_TYPE(DATA_TYPE, N0)
+ b0;
+
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(0, a, b0, c);
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(1, a, b0, c);
+#if K0 > 2
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(2, a, b0, c);
+#endif // K0 > 2
+#if K0 > 3
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(3, a, b0, c);
+#endif // K0 > 3
+#if K0 > 4
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(4, a, b0, c);
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(5, a, b0, c);
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(6, a, b0, c);
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(7, a, b0, c);
+#endif // K0 > 4
+#if K0 > 8
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(8, a, b0, c);
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(9, a, b0, c);
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(A, a, b0, c);
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(B, a, b0, c);
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(C, a, b0, c);
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(D, a, b0, c);
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(E, a, b0, c);
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(F, a, b0, c);
+#endif // K0 > 8
+
+ lhs_offset += K0 * sizeof(DATA_TYPE);
+ rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);
+ }
+
+ // Left-over accumulations
+ for(; i < K; ++i)
+ {
+ // Load values from LHS matrix
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
+#if M0 > 1
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
+#endif // M0 > 1
+#if M0 > 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
+#endif // M0 > 2
+#if M0 > 3
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
+#endif // M0 > 3
+#if M0 > 4
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
+#endif // M0 > 4
+#if M0 > 5
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
+#endif // M0 > 5
+#if M0 > 6
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
+#endif // M0 > 6
+#if M0 > 7
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
+#endif // M0 > 7
+
+ VEC_DATA_TYPE(DATA_TYPE, N0)
+ b0;
+
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(0, a, b0, c);
+
+ lhs_offset += sizeof(DATA_TYPE);
+ rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+ // Boundary conditions: detect if current block is at the "bottom" or "right" boundary
+ const bool cond_y = ((y + 1) * M0 >= M);
+ const bool cond_x = ((x + 1) * N0 >= N);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * M0 * bias_stride_y) + z * bias_stride_z;
+
+ LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+ // c = act(c)
+ POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
+ // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware)
+ POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+ // c = act(c)
+ POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
+
+ // Store output block
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+
+#if defined(OPENCL_IMAGE_SUPPORT)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in an OpenCL image object.
+ * Post op 1: activation (optional)
+ * Post op 2: elementwise op
+ * Post op 3: activation (optional)
+ *
+ * @note (Optional) -DP1_ACTIVATION_TYPE, -DP1_ACTIVATION_A_VAL, -DP1_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 1
+ * @note (Required) -DP2_ELTWISE_OP: The (binary) elementwise post op to perform
+ * @note (Required) -DP2_ELTWISE_ARG1_HEIGHT: The height (Y dimension) of the eltwise operand matrix of the eltwise post op at slot 2
+ * @note (Required) -DP2_ELTWISE_ARG1_WIDTH: The width (X dimension) of the eltwise operand matrix of the eltwise post op at slot 2
+ * @note (Optional) -DP3_ACTIVATION_TYPE, -DP3_ACTIVATION_A_VAL, -DP3_ACTIVATION_B_VAL: The activation type, alpha and beta values of the activation post op at slot 3
+ *
+ * All parameters are similarly defined in kernel gemm_mm_reshaped_only_rhs_nt_texture, with these additions:
+ *
+ * @param[in] eltwise_operand_ptr Pointer to the eltwise operand matrix. Supported data type: F16/F32
+ * @param[in] eltwise_operand_stride_x Stride of the eltwise operand matrix in X dimension (in bytes)
+ * @param[in] eltwise_operand_step_x eltwise_operand_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
+ * @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
+ */
+__kernel void gemm_mm_reshaped_only_rhs_nt_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
+ __read_only image2d_t rhs_img,
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst),
+ // Post-Op arguments
+ IMAGE_DECLARATION(eltwise_operand),
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif //defined(BETA)
+ uint dst_stride_z,
+ uint eltwise_operand_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ )
+{
+ // Pixel unit
+#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
+
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (PIXEL_UNIT)
+#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (PIXEL_UNIT)
+#endif // defined(RHS_INTERLEAVE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ const uint z_rhs = (z % MATRIX_B_DEPTH);
+#else // defined(MATRIX_B_DEPTH)
+ const uint z_rhs = z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ // Compute RHS matrix coordinates
+ uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
+ const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+
+ // The plane (zin) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zin, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
+
+ int i = 0;
+ for(; i <= (K - K0); i += K0)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
+
+ VEC_DATA_TYPE(DATA_TYPE, N0)
+ b0;
+
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(0, a, b0, c);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(1, a, b0, c);
+#if K0 > 2
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(2, a, b0, c);
+#endif // K0 > 2
+#if K0 > 3
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(3, a, b0, c);
+#endif // K0 > 3
+#if K0 > 4
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(4, a, b0, c);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(5, a, b0, c);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(6, a, b0, c);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(7, a, b0, c);
+#endif // K0 > 4
+#if K0 > 8
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(8, a, b0, c);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(9, a, b0, c);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(A, a, b0, c);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(B, a, b0, c);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(C, a, b0, c);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(D, a, b0, c);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(E, a, b0, c);
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
+ VFMA_M0xN0(F, a, b0, c);
+#endif // K0 > 8
+
+ lhs_offset += K0 * sizeof(DATA_TYPE);
+ x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP;
+ }
+
+ // Left-over accumulations
+ for(; i < K; ++i)
+ {
+ // Load values from LHS matrix
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
+#if M0 > 1
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
+#endif // M0 > 1
+#if M0 > 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
+#endif // M0 > 2
+#if M0 > 3
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
+#endif // M0 > 3
+#if M0 > 4
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
+#endif // M0 > 4
+#if M0 > 5
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
+#endif // M0 > 5
+#if M0 > 6
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
+#endif // M0 > 6
+#if M0 > 7
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
+#endif // M0 > 7
+
+ VEC_DATA_TYPE(DATA_TYPE, N0)
+ b0;
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
+
+ VFMA_M0xN0(0, a, b0, c);
+
+ lhs_offset += sizeof(DATA_TYPE);
+ x_rhs += RHS_STEP_X;
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+ // Boundary conditions: detect if current block is at the "bottom" or "right" boundary
+ const bool cond_y = ((y + 1) * M0 >= M);
+ const bool cond_x = ((x + 1) * N0 >= N);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * M0 * bias_stride_y) + z * bias_stride_z;
+
+ LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+ // c = act(c)
+ POST_OP1_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
+ // c = c + eltwise_operand (mixed-precision, broadcast, boundary aware)
+ POST_OP2_ELTWISE_OP(P2_ELTWISE_OP, M0, N0, c, eltwise_operand, DATA_TYPE, DATA_TYPE_ACCUMULATOR, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+ // c = act(c)
+ POST_OP3_ACTIVATION_OPTIONAL(M0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, N0, c);
+
+ // Store output block
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(OPENCL_IMAGE_SUPPORT)
+#endif // defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
\ No newline at end of file
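
As a rough illustration of the compile-time interface documented in the kernel docstrings above, the sketch below assembles the -D defines named in the @note lines. It is illustrative only: the option names come from the docstring, while the concrete values and the helper function itself are placeholders, not part of this patch.

    // Illustrative sketch only: option names follow the kernel docstring above,
    // but the values (activation type, operand shape) are placeholders.
    #include <string>
    #include <vector>

    std::string example_post_op_build_options()
    {
        const std::vector<std::string> defines =
        {
            "-DP1_ACTIVATION_TYPE=relu", "-DP1_ACTIVATION_A_VAL=6.0f", "-DP1_ACTIVATION_B_VAL=0.0f", // optional post op at slot 1
            "-DP2_ELTWISE_OP=ADD",                                                                    // required post op at slot 2
            "-DP2_ELTWISE_ARG1_HEIGHT=16", "-DP2_ELTWISE_ARG1_WIDTH=32",                              // required eltwise operand shape
            "-DP3_ACTIVATION_TYPE=relu", "-DP3_ACTIVATION_A_VAL=6.0f", "-DP3_ACTIVATION_B_VAL=0.0f"   // optional post op at slot 3
        };
        std::string options;
        for(const auto &d : defines)
        {
            options += d + " ";
        }
        return options;
    }
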
diff --git a/src/core/CL/cl_kernels/common/gemm.cl b/src/core/CL/cl_kernels/common/gemm.cl
index 9732588b31..6502dd496a 100644
--- a/src/core/CL/cl_kernels/common/gemm.cl
+++ b/src/core/CL/cl_kernels/common/gemm.cl
@@ -1000,6 +1000,7 @@ __kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix is NOT reshaped
* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
+ * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
*
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
* @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and and -DK (e.g. -DM=52, -DN=30 and -DK=90)
@@ -1294,6 +1295,7 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image
* The LHS matrix is NOT reshaped
* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
+ * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
*
* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
@@ -1720,6 +1722,7 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix is NOT reshaped
* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
+ * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
*
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
* @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and and -DK (e.g. -DM=52, -DN=30 and -DK=90).
@@ -2038,6 +2041,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix is NOT reshaped
* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
+ * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
*
* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
@@ -4025,6 +4029,7 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
* The LHS matrix is NOT reshaped
* The RHS matrix is NOT reshaped
+ * @note This kernel is duplicated in /experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl
*
* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
* @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and and -DK (e.g. -DM=52, -DN=30 and -DK=90)
diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp
index cbc4caf5f6..c47cf8ef11 100644
--- a/src/gpu/cl/ClKernelLibrary.cpp
+++ b/src/gpu/cl/ClKernelLibrary.cpp
@@ -272,6 +272,7 @@ const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
{ "gemm_mv", "common/gemv.cl" },
{ "gemm_mv_quantized", "common/gemv.cl" },
{ "gemm_mm_native", "common/gemm.cl" },
+ { "gemm_mm_native_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl" },
{ "gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl" },
{ "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl" },
{ "gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl" },
@@ -284,6 +285,10 @@ const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
{ "gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl" },
{ "gemm_mm_reshaped_only_rhs_t", "common/gemm.cl" },
{ "gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl" },
+ { "gemm_mm_reshaped_only_rhs_nt_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" },
+ { "gemm_mm_reshaped_only_rhs_nt_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" },
+ { "gemm_mm_reshaped_only_rhs_t_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" },
+ { "gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" },
{ "gemm_lc_vm_f32", "common/gemm.cl" },
{ "gemm_reshape_lhs_matrix_nt", "common/gemm.cl" },
{ "gemm_reshape_lhs_matrix_t", "common/gemm.cl" },
@@ -585,10 +590,18 @@ const std::map<std::string, std::string> ClKernelLibrary::_program_source_map =
#include "./cl_kernels/common/gemm.clembed"
},
{
+ "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl",
+#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.clembed"
+ },
+ {
"common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl",
#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.clembed"
},
{
+ "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl",
+#include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.clembed"
+ },
+ {
"common/gemmlowp.cl",
#include "./cl_kernels/common/gemmlowp.clembed"
},
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
index e389ce5b0c..7ad3d55fe0 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
@@ -33,6 +33,8 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLUtils.h"
+#include "src/core/experimental/PostOp.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/core/utils/helpers/float_ops.h"
@@ -49,6 +51,17 @@ namespace
{
using ElementsProcessed = Steps;
+const auto post_op_utils = experimental::PostOpCLKernelUtils(
+{
+ // PostOp sequence -> {Kernel Postfix, PostOp Slots}
+ { {}, { "", {} } },
+ { { experimental::PostOpType::Activation }, { "", { 1 } } },
+ { { experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 2 } } },
+ { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 1, 2 } } },
+ { { experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } },
+ { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } }
+});
+
Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
const GEMMRHSMatrixInfo &rhs_info,
const GEMMKernelInfo &gemm_info)
@@ -68,6 +81,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
"Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for GEMM native");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.is_post_op_sequence_supported(gemm_info.post_ops), "The sequence of Post Ops is not supported");
const unsigned int m = gemm_info.m;
const unsigned int n = gemm_info.n;
@@ -110,6 +124,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.are_post_op_shapes_compliant(dst, gemm_info.post_ops), "The Post Op shapes are not compliant");
}
return Status{};
@@ -170,16 +185,17 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
-
// dst tensor auto initialization if not yet initialized
auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
+
auto padding_info = get_padding_info({ src0, src1, src2, dst });
_reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
_reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
_use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
_add_bias = src2 != nullptr;
+ _num_post_op_args = gemm_info.post_ops.total_num_arguments();
// In case both input and dst have to be reinterpreted as 3D tensors,
// force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
@@ -237,11 +253,20 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile
build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
+ // If post_ops are used, then we disable the use of gemm_info.activation_info
+ if(gemm_info.post_ops.size() > 0)
+ {
+ post_op_utils.set_post_ops_cl_build_options(build_opts, gemm_info.post_ops);
+ }
+ else
+ {
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
+ }
std::string kernel_name("gemm_mm_native");
+ post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops);
// Create kernel
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
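
For clarity, the postfix/slot table added at the top of this file means that any supported post-op sequence containing an Eltwise_Add selects the "_post_act_eltwise_op_act" kernel variant, while an empty list or a lone Activation keeps the base kernel name. A minimal stand-alone sketch of that naming rule follows; it is not the real PostOpCLKernelUtils API, just an illustration with local names.

    #include <algorithm>
    #include <string>
    #include <vector>

    enum class ExamplePostOpType { Activation, Eltwise_Add }; // stand-in for experimental::PostOpType

    std::string example_fused_kernel_name(std::string base, const std::vector<ExamplePostOpType> &ops)
    {
        // Mirrors the table above: only sequences containing Eltwise_Add get the fused postfix.
        const bool has_eltwise = std::find(ops.begin(), ops.end(), ExamplePostOpType::Eltwise_Add) != ops.end();
        return has_eltwise ? base + "_post_act_eltwise_op_act" : base;
    }

    // example_fused_kernel_name("gemm_mm_native", { ExamplePostOpType::Activation,
    //                                               ExamplePostOpType::Eltwise_Add,
    //                                               ExamplePostOpType::Activation })
    //   == "gemm_mm_native_post_act_eltwise_op_act"
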
@@ -323,11 +348,11 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window
unsigned int idx0;
if(_add_bias)
{
- idx0 = 4 * num_arguments_per_2D_tensor() + 4;
+ idx0 = (4 + _num_post_op_args) * num_arguments_per_2D_tensor() + (4 + _num_post_op_args);
}
else
{
- idx0 = 3 * num_arguments_per_2D_tensor() + 3;
+ idx0 = (3 + _num_post_op_args) * num_arguments_per_2D_tensor() + (3 + _num_post_op_args);
}
const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom;
_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
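
The index arithmetic above follows from the kernel argument layout: every 2D tensor (lhs, rhs, optional bias, dst, plus one tensor per post-op argument) contributes num_arguments_per_2D_tensor() arguments, and each of those tensors is also followed by one stride_z scalar. A worked example, assuming num_arguments_per_2D_tensor() is 6 and a single post-op argument (both values are assumptions made for illustration only):

    // Worked example only; the constants below are assumed, not taken from the library.
    constexpr unsigned int args_per_2d_tensor = 6; // assumed value of num_arguments_per_2D_tensor()
    constexpr unsigned int num_post_op_args   = 1; // e.g. one eltwise operand tensor
    constexpr unsigned int idx0_with_bias     = (4 + num_post_op_args) * args_per_2d_tensor + (4 + num_post_op_args); // 35
    constexpr unsigned int idx0_without_bias  = (3 + num_post_op_args) * args_per_2d_tensor + (3 + num_post_op_args); // 28
    static_assert(idx0_with_bias == 35 && idx0_without_bias == 28,
                  "the cross-plane pad is set right after all tensor arguments and their stride_z values");
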
@@ -339,11 +364,11 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window
unsigned int idx0;
if(_add_bias)
{
- idx0 = 4 * num_arguments_per_2D_tensor() + 4 + (_reinterpret_input_as_3d ? 1 : 0);
+ idx0 = (4 + _num_post_op_args) * num_arguments_per_2D_tensor() + 4 + (_reinterpret_input_as_3d ? 1 : 0) + _num_post_op_args;
}
else
{
- idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
+ idx0 = (3 + _num_post_op_args) * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0) + _num_post_op_args;
}
const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
@@ -367,6 +392,12 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window
add_2D_tensor_argument(idx, src2, slice);
}
add_2D_tensor_argument(idx, dst, slice);
+ // post op argument buffers
+ for(size_t i = 0; i < _num_post_op_args; ++i)
+ {
+ const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
+ add_2D_tensor_argument(idx, post_op_arg, slice);
+ }
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
if(_add_bias)
@@ -374,6 +405,12 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2]));
}
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
+ // post op argument stride_z
+ for(size_t i = 0; i < _num_post_op_args; ++i)
+ {
+ const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(post_op_arg->info()->strides_in_bytes()[2]));
+ }
enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
}
while(window.slide_window_slice_3D(slice));
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
index 89837cc515..415eb7bf3b 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
@@ -76,11 +76,12 @@ public:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
private:
- bool _slide_matrix_b{ true };
- bool _reinterpret_input_as_3d{ false };
- bool _reinterpret_output_as_3d{ false };
- bool _use_dummy_work_items{ false };
- bool _add_bias{ false };
+ bool _slide_matrix_b{ true };
+ bool _reinterpret_input_as_3d{ false };
+ bool _reinterpret_output_as_3d{ false };
+ bool _use_dummy_work_items{ false };
+ bool _add_bias{ false };
+ unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments
};
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
index 04c1cd66c9..260ed134e4 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/CL/CLUtils.h"
#include "src/core/CL/CLValidate.h"
+#include "src/core/experimental/PostOp.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/core/utils/helpers/float_ops.h"
@@ -44,6 +45,17 @@ namespace
{
using ElementsProcessed = Steps;
+const auto post_op_utils = experimental::PostOpCLKernelUtils(
+{
+ // PostOp sequence -> {Kernel Postfix, PostOp Slots}
+ { {}, { "", {} } },
+ { { experimental::PostOpType::Activation }, { "", { 1 } } },
+ { { experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 2 } } },
+ { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add }, { "_post_act_eltwise_op_act", { 1, 2 } } },
+ { { experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 2, 3 } } },
+ { { experimental::PostOpType::Activation, experimental::PostOpType::Eltwise_Add, experimental::PostOpType::Activation }, { "_post_act_eltwise_op_act", { 1, 2, 3 } } }
+});
+
Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
{
@@ -64,6 +76,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
"Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.is_post_op_sequence_supported(gemm_info.post_ops), "The sequence of Post Ops is not supported");
const unsigned int m = gemm_info.m;
const unsigned int n = gemm_info.n;
@@ -109,6 +122,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!post_op_utils.are_post_op_shapes_compliant(dst, gemm_info.post_ops), "The Post Op shapes are not compliant");
}
return Status{};
@@ -168,6 +182,9 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+ // dst tensor auto initialization if not yet initialized
+ auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
_reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
@@ -176,9 +193,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext
_add_bias = src2 != nullptr;
_export_to_cl_image = rhs_info.export_to_cl_image;
_has_pad_y = gemm_info.has_pad_y;
-
- // dst tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+ _num_post_op_args = gemm_info.post_ops.total_num_arguments();
auto padding_info = get_padding_info({ src0, src1, src2, dst });
@@ -239,9 +254,6 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext
build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
if(_has_pad_y)
{
build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
@@ -249,10 +261,22 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext
build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
}
+ // If post_ops are used, then we disable the use of gemm_info.activation_info
+ if(gemm_info.post_ops.size() > 0)
+ {
+ post_op_utils.set_post_ops_cl_build_options(build_opts, gemm_info.post_ops);
+ }
+ else
+ {
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
+ }
std::string kernel_name("gemm_mm_reshaped_only_rhs_");
kernel_name += rhs_info.transpose ? "t" : "nt";
kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
+ post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops);
// Create kernel
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
@@ -375,6 +399,13 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con
// dst buffer
add_2D_tensor_argument(idx, dst, slice);
+ // post op argument buffers
+ for(size_t i = 0; i < _num_post_op_args; ++i)
+ {
+ const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
+ add_2D_tensor_argument(idx, post_op_arg, slice);
+ }
+
// LHS stride_z
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[lhs_idx_batch_size]));
@@ -389,6 +420,12 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con
// dst stride_z
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[out_idx_batch_size]));
+ // post op argument stride_z
+ for(size_t i = 0; i < _num_post_op_args; ++i)
+ {
+ const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(post_op_arg->info()->strides_in_bytes()[2]));
+ }
// Cross-plan padding (if _reinterpret_input_as_3d = true)
if(_reinterpret_input_as_3d && _has_pad_y)
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
index cb82b4af5e..a8f0c4c3a0 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
@@ -90,13 +90,14 @@ public:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
private:
- bool _slide_matrix_b{ true };
- bool _reinterpret_input_as_3d{ false };
- bool _reinterpret_output_as_3d{ false };
- bool _use_dummy_work_items{ false };
- bool _add_bias{ false };
- bool _export_to_cl_image{ false };
- bool _has_pad_y{ false };
+ bool _slide_matrix_b{ true };
+ bool _reinterpret_input_as_3d{ false };
+ bool _reinterpret_output_as_3d{ false };
+ bool _use_dummy_work_items{ false };
+ bool _add_bias{ false };
+ bool _export_to_cl_image{ false };
+ bool _has_pad_y{ false };
+ unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments
};
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/operators/ClGemm.cpp b/src/gpu/cl/operators/ClGemm.cpp
index e05256ee2f..50ecb214e3 100644
--- a/src/gpu/cl/operators/ClGemm.cpp
+++ b/src/gpu/cl/operators/ClGemm.cpp
@@ -204,7 +204,6 @@ ClGemm::ClGemm()
void ClGemm::configure_native(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,
const GEMMInfo &gemm_info)
{
- ARM_COMPUTE_ERROR_ON_MSG(gemm_info.post_ops().size() > 0, "PostOps are not supported in this kernel");
DataType data_type = a->data_type();
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
@@ -223,6 +222,7 @@ void ClGemm::configure_native(const CLCompileContext &compile_context, ITensorIn
kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
kernel_info.broadcast_bias = broadcast_bias;
kernel_info.activation_info = gemm_info.activation_info();
+ kernel_info.post_ops = gemm_info.post_ops();
// Set the target for the kernels
_mm_native_kernel->set_target(gpu_target);
@@ -281,7 +281,6 @@ void ClGemm::configure_reshaped(const CLCompileContext &compile_context, ITensor
void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,
const GEMMInfo &gemm_info)
{
- ARM_COMPUTE_ERROR_ON_MSG(gemm_info.post_ops().size() > 0, "PostOps are not supported in this kernel");
DataType data_type = a->data_type();
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
@@ -300,6 +299,7 @@ void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context
kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
kernel_info.broadcast_bias = broadcast_bias;
kernel_info.activation_info = gemm_info.activation_info();
+ kernel_info.post_ops = gemm_info.post_ops();
// Set the target for the kernels
_mm_reshaped_only_rhs_kernel->set_target(gpu_target);
@@ -334,7 +334,6 @@ Status ClGemm::validate_native(const ITensorInfo *a, const ITensorInfo *b, const
{
ARM_COMPUTE_UNUSED(alpha);
ARM_COMPUTE_UNUSED(output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.post_ops().size() > 0, "PostOps are not supported in this kernel");
// Get the GPU target
const GPUTarget gpu_target = CLScheduler::get().target();
@@ -355,6 +354,7 @@ Status ClGemm::validate_native(const ITensorInfo *a, const ITensorInfo *b, const
kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
kernel_info.broadcast_bias = broadcast_bias;
kernel_info.activation_info = gemm_info.activation_info();
+ kernel_info.post_ops = gemm_info.post_ops();
auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
@@ -418,7 +418,6 @@ Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInf
{
ARM_COMPUTE_UNUSED(alpha);
ARM_COMPUTE_UNUSED(output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.post_ops().size() > 0, "PostOps are not supported in this kernel");
TensorInfo tmp_b_info{};
@@ -441,6 +440,7 @@ Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInf
kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
kernel_info.broadcast_bias = broadcast_bias;
kernel_info.activation_info = gemm_info.activation_info();
+ kernel_info.post_ops = gemm_info.post_ops();
GEMMLHSMatrixInfo lhs_info;
GEMMRHSMatrixInfo rhs_info;
@@ -562,10 +562,9 @@ Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
void ClGemm::run(ITensorPack &tensors)
{
- const ITensor *lhs = tensors.get_const_tensor(ACL_SRC_0);
- const ITensor *rhs = tensors.get_const_tensor(ACL_SRC_1);
- const ITensor *src2 = tensors.get_const_tensor(ACL_SRC_2);
- ITensor *dst = tensors.get_tensor(ACL_DST);
+ const ITensor *lhs = tensors.get_const_tensor(ACL_SRC_0);
+ const ITensor *rhs = tensors.get_const_tensor(ACL_SRC_1);
+ ITensor *dst = tensors.get_tensor(ACL_DST);
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, dst);
@@ -620,7 +619,10 @@ void ClGemm::run(ITensorPack &tensors)
const unsigned int cross_plane_pad_dst = dst->info()->padding().top + dst->info()->padding().bottom;
bool has_pad_y = (cross_plane_pad_lhs != 0) || (cross_plane_pad_dst != 0);
- ITensorPack gemm_reshaped_onlyrhs_pack{ { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs_reshaped.get() }, { ACL_SRC_2, src2 }, { ACL_DST, dst } };
+ // Copy original tensor pack and overwrite rhs with reshaped counterpart
+ ITensorPack gemm_reshaped_onlyrhs_pack(tensors);
+ gemm_reshaped_onlyrhs_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get());
+
if(has_pad_y)
{
CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_fallback_kernel, gemm_reshaped_onlyrhs_pack, true);
diff --git a/tests/framework/Macros.h b/tests/framework/Macros.h
index a6ba137b59..ac03bb02b6 100644
--- a/tests/framework/Macros.h
+++ b/tests/framework/Macros.h
@@ -49,8 +49,8 @@
#define CONCAT(ARG0, ARG1) ARG0##ARG1
-#define VARIADIC_SIZE_IMPL(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, size, ...) size
-#define VARIADIC_SIZE(...) VARIADIC_SIZE_IMPL(__VA_ARGS__, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
+#define VARIADIC_SIZE_IMPL(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, size, ...) size
+#define VARIADIC_SIZE(...) VARIADIC_SIZE_IMPL(__VA_ARGS__, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
#define JOIN_PARAM1(OP, param) OP(0, param)
#define JOIN_PARAM2(OP, param, ...) \
@@ -92,6 +92,12 @@
#define JOIN_PARAM14(OP, param, ...) \
OP(13, param) \
, JOIN_PARAM13(OP, __VA_ARGS__)
+#define JOIN_PARAM15(OP, param, ...) \
+ OP(14, param) \
+ , JOIN_PARAM14(OP, __VA_ARGS__)
+#define JOIN_PARAM16(OP, param, ...) \
+ OP(15, param) \
+ , JOIN_PARAM15(OP, __VA_ARGS__)
#define JOIN_PARAM(OP, NUM, ...) \
CONCAT(JOIN_PARAM, NUM) \
(OP, __VA_ARGS__)
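
The VARIADIC_SIZE change above raises the maximum number of macro arguments from 14 to 16, presumably to make room for the extra dataset parameters taken by the new post-op fixtures. The counting trick can be checked in isolation with the small stand-alone sketch below (EX_* are local names for this example, not the framework's macros).

    // Stand-alone check of the argument-counting trick used by VARIADIC_SIZE.
    #define EX_SIZE_IMPL(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, size, ...) size
    #define EX_SIZE(...) EX_SIZE_IMPL(__VA_ARGS__, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)

    static_assert(EX_SIZE(a) == 1, "one argument");
    static_assert(EX_SIZE(a, b, c) == 3, "three arguments");
    static_assert(EX_SIZE(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) == 16, "new maximum after this change");
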
diff --git a/tests/validation/CL/GEMMMatrixMultiplyNative.cpp b/tests/validation/CL/GEMMMatrixMultiplyNative.cpp
index dc5fbc36ba..e3f151a2ca 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyNative.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyNative.cpp
@@ -53,6 +53,11 @@ using CLGEMMMatrixMultiplyNative = CLSynthetizeOperator<ClGemmMatrixMultiplyNati
template <typename T>
using CLGEMMMatrixMultiplyNativeFixture = GEMMMatrixMultiplyNativeValidationFixture<CLTensor, CLAccessor, T, CLGEMMMatrixMultiplyNative>;
+// Fixture for CLGEMMMatrixMultiplyNative with post ops
+template <typename T>
+using CLGEMMMatrixMultiplyNativeWithPostOpsFixture =
+ GEMMMatrixMultiplyNativeWithPostOpsValidationFixture<CLTensor, CLAccessor, T, CLGEMMMatrixMultiplyNative>;
+
// Fixture for CLGEMMMatrixMultiplyNative3D
template <typename T>
using CLGEMMMatrixMultiplyNative3DFixture = GEMMMatrixMultiplyNative3DValidationFixture<CLTensor, CLAccessor, T, CLGEMMMatrixMultiplyNative>;
@@ -141,6 +146,80 @@ const auto boundary_handling_cases = combine(combine(combine(combine(combine(com
broadcast_bias_values),
framework::dataset::make("Activation", ActivationLayerInfo()));
+/** Post Ops */
+using PostOpArgBroadcast = CLGEMMMatrixMultiplyNativeWithPostOpsFixture<float>::PostOpArgBroadcast;
+experimental::PostOpList<PostOpArgBroadcast> post_ops_1()
+{
+ experimental::PostOpList<PostOpArgBroadcast> post_ops{};
+ post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F});
+ post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
+ std::make_tuple(true, true, false), // If broadcast in dims 0, 1 and 2
+ 0,
+ ConvertPolicy::SATURATE);
+ post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
+ return post_ops;
+}
+experimental::PostOpList<PostOpArgBroadcast> post_ops_2()
+{
+ experimental::PostOpList<PostOpArgBroadcast> post_ops{};
+ post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
+ std::make_tuple(false, true, true), // If broadcast in dims 0, 1 and 2
+ 1,
+ ConvertPolicy::SATURATE);
+ post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
+ return post_ops;
+}
+experimental::PostOpList<PostOpArgBroadcast> post_ops_3()
+{
+ experimental::PostOpList<PostOpArgBroadcast> post_ops{};
+ // post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
+ post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
+ std::make_tuple(false, false, false), // If broadcast in dims 0, 1 and 2
+ 1,
+ ConvertPolicy::SATURATE);
+ return post_ops;
+}
+
+/** Different Post Op Lists */
+const auto post_op_lists = framework::dataset::make("post_op_lists", {
+ post_ops_1(),
+ post_ops_2(),
+ post_ops_3(),
+} );
+
+bool is_post_op_list_valid(unsigned int m, unsigned int n, unsigned int k, unsigned int batch, DataType data_type, const experimental::PostOpList<ITensorInfo*>& post_ops)
+{
+ const auto lhs_info = GEMMLHSMatrixInfo(4,4,1,false,true);
+ const auto rhs_info = GEMMRHSMatrixInfo(4,4,1,true,true,false);
+
+ // Create TensorInfo for post op arguments
+ TensorInfo input0_info(TensorShape(k, m, batch), 1, data_type);
+ TensorInfo input1_info(TensorShape(n, k, batch), 1, data_type);
+ TensorInfo input2_info(TensorShape(n), 1, data_type);
+ TensorInfo output_info(TensorShape(n, m, batch), 1, data_type);
+
+ GEMMKernelInfo gemm_info(m, n, k, 0 /**< Depth of the output tensor in case it is reinterpreted as 3D */,
+ false /**< reinterpret the input as 3D */,
+ true /**< Flag used to broadcast the bias addition */,
+ false /**< wider accum */,
+ false /**< has pad y */,
+ ActivationLayerInfo::ActivationFunction::IDENTITY,
+ 1 /**< Multiplication factor for the width of the 1xW transposed block */,
+ 1 /**< Multiplication factor for the height of the 4x4 interleaved block */,
+ lhs_info,
+ rhs_info,
+ 0 /**< Offset to be added to each element of the matrix A */,
+ 0 /**< Offset to be added to each element of the matrix B */,
+ post_ops);
+ return bool(ClGemmMatrixMultiplyNativeKernel::validate(&input0_info.clone()->set_is_resizable(true),
+ &input1_info.clone()->set_is_resizable(true),
+ &input2_info.clone()->set_is_resizable(true),
+ &output_info.clone()->set_is_resizable(true),1.f,1.f,
+ lhs_info,
+ rhs_info,
+ gemm_info));
+}
+
/** Configuration test */
void validate_configuration(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value, unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, bool broadcast_bias, DataType data_type, const ActivationLayerInfo &act_info)
{
@@ -191,6 +270,119 @@ void validate_configuration(unsigned int m_value, unsigned int n_value, unsigned
TEST_SUITE(CL)
TEST_SUITE(GEMMMatrixMultiplyNative)
+TEST_SUITE(ValidateFusedPostOpsConfigs)
+TEST_SUITE(Invalid)
+TEST_CASE(UnsupportedPostOpSequence, framework::DatasetMode::ALL)
+{
+ const auto data_type = DataType::F32;
+ const unsigned int m = 17;
+ const unsigned int n = 1;
+ const unsigned int k = 13;
+ const unsigned int batch = 2;
+ TensorShape post_op_arg0_shape(n, m, batch);
+ TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type);
+ auto post_op_arg1_info = post_op_arg_info.clone();
+
+ // Unsupported sequence of post ops
+ experimental::PostOpList<ITensorInfo*> post_ops{};
+ post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>(
+ &post_op_arg_info,
+ 1,
+ ConvertPolicy::SATURATE);
+ post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>(
+ post_op_arg1_info.get(),
+ 0,
+ ConvertPolicy::SATURATE);
+
+ ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS);
+}
+TEST_CASE(OutputWidened, framework::DatasetMode::ALL)
+{
+ // Invalid broadcast: post op tensors "widen" the output tensor
+ const auto data_type = DataType::F32;
+ const unsigned int m = 1;
+ const unsigned int n = 18;
+ const unsigned int k = 13;
+ const unsigned int batch = 2;
+ TensorShape post_op_arg_shape(n, m + 1, batch); // output's Y dimension (m) is "widened", which is not allowed
+ TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
+ experimental::PostOpList<ITensorInfo*> post_ops{};
+ post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
+
+ ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS);
+}
+TEST_CASE(BroadcastInXDimOnly, framework::DatasetMode::ALL)
+{
+ // Invalid broadcast: post op tensors broadcast in the first dimension (X) only
+ const auto data_type = DataType::F32;
+ const unsigned int m = 22;
+ const unsigned int n = 16;
+ const unsigned int k = 15;
+ const unsigned int batch = 3;
+ TensorShape post_op_arg_shape(1, m, batch);
+ TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
+ experimental::PostOpList<ITensorInfo*> post_ops{};
+ post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
+
+ ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // Invalid
+TEST_SUITE(Valid)
+TEST_CASE(EmptyPostOpList, framework::DatasetMode::ALL)
+{
+ const auto data_type = DataType::F32;
+ const unsigned int m = 22;
+ const unsigned int n = 16;
+ const unsigned int k = 15;
+ const unsigned int batch = 3;
+ experimental::PostOpList<ITensorInfo*> post_ops{};
+
+ ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
+}
+TEST_CASE(BroadcastInYDimOnly, framework::DatasetMode::ALL)
+{
+ const auto data_type = DataType::F32;
+ const unsigned int m = 22;
+ const unsigned int n = 16;
+ const unsigned int k = 15;
+ const unsigned int batch = 3;
+ TensorShape post_op_arg_shape(n, 1, batch);
+ TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
+ experimental::PostOpList<ITensorInfo*> post_ops{};
+ post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
+
+ ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
+}
+TEST_CASE(BroadcastInBothXandYDims, framework::DatasetMode::ALL)
+{
+ const auto data_type = DataType::F32;
+ const unsigned int m = 22;
+ const unsigned int n = 16;
+ const unsigned int k = 15;
+ const unsigned int batch = 3;
+ TensorShape post_op_arg_shape(1, 1, batch);
+ TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
+ experimental::PostOpList<ITensorInfo*> post_ops{};
+ post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
+
+ ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
+}
+TEST_CASE(BroadcastInAllDims, framework::DatasetMode::ALL)
+{
+ const auto data_type = DataType::F32;
+ const unsigned int m = 22;
+ const unsigned int n = 16;
+ const unsigned int k = 15;
+ const unsigned int batch = 3;
+ TensorShape post_op_arg_shape(1, 1, 1);
+ TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
+ experimental::PostOpList<ITensorInfo*> post_ops{};
+ post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
+
+ ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // Valid
+TEST_SUITE_END() // ValidateFusedPostOpsConfigs
TEST_SUITE(Float)
TEST_SUITE(FP32)
DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine(
@@ -323,6 +515,32 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyNative3DFixture<float>, f
// Validate output
validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
}
+
+TEST_SUITE(FusedPostOps)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyNativeWithPostOpsFixture<float>, framework::DatasetMode::ALL,
+ combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+ m_values,
+ n_values),
+ k_values),
+ b_values),
+ framework::dataset::make("M0", { 4 })),
+ n0_values_precommit),
+ k0_values_precommit),
+ framework::dataset::make("DataType", DataType::F32)),
+ framework::dataset::make("alpha", {1.0f} )),
+ framework::dataset::make("beta", {1.0f} )),
+ framework::dataset::make("broadcast_bias", { false, true } )),
+ framework::dataset::make("Activation", { ActivationLayerInfo() })),
+ post_op_lists)
+ )
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+TEST_SUITE_END() // FusedPostOps
+
TEST_SUITE_END() // FP32
TEST_SUITE_END() // Float
TEST_SUITE_END() // GEMMMatrixMultiplyNative
diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp
index 0f86a70e0f..9e1a185a10 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "src/core/experimental/PostOp.h"
#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h"
#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
#include "tests/CL/CLAccessor.h"
@@ -61,6 +62,11 @@ using CLGEMMMatrixMultiplyReshapedOnlyRHSFixture = GEMMMatrixMultiplyReshapedOnl
template <typename T>
using CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixture = GEMMMatrixMultiplyReshapedOnlyRHS3DValidationFixture<CLTensor, CLAccessor, T, CLGEMMReshapeRHSMatrix, CLGEMMMatrixMultiplyReshapedOnlyRHS>;
+// Fixture for CLGEMMMatrixMultiplyReshapedOnlyRHS with post ops
+template <typename T>
+using CLGEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsFixture =
+ GEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsValidationFixture<CLTensor, CLAccessor, T, CLGEMMReshapeRHSMatrix, CLGEMMMatrixMultiplyReshapedOnlyRHS>;
+
namespace
{
// *INDENT-OFF*
@@ -157,6 +163,81 @@ const auto boundary_handling_cases = combine(combine(combine(combine(combine(com
broadcast_bias_values),
framework::dataset::make("Activation", ActivationLayerInfo()));
+/** Post Ops */
+using PostOpArgBroadcast = CLGEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsFixture<float>::PostOpArgBroadcast;
+experimental::PostOpList<PostOpArgBroadcast> post_ops_1()
+{
+ experimental::PostOpList<PostOpArgBroadcast> post_ops{};
+ post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F});
+ post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
+ std::make_tuple(true, true, false), // Whether to broadcast in dims 0, 1 and 2
+ 0,
+ ConvertPolicy::SATURATE);
+ post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
+ return post_ops;
+}
+experimental::PostOpList<PostOpArgBroadcast> post_ops_2()
+{
+ experimental::PostOpList<PostOpArgBroadcast> post_ops{};
+ post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
+ std::make_tuple(false, true, true), // Whether to broadcast in dims 0, 1 and 2
+ 1,
+ ConvertPolicy::SATURATE);
+ post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
+ return post_ops;
+}
+experimental::PostOpList<PostOpArgBroadcast> post_ops_3()
+{
+ experimental::PostOpList<PostOpArgBroadcast> post_ops{};
+ post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F});
+ post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
+ std::make_tuple(false, false, true), // Whether to broadcast in dims 0, 1 and 2
+ 1,
+ ConvertPolicy::SATURATE);
+ return post_ops;
+}
+
+/** Different Post Op Lists */
+const auto post_op_lists = framework::dataset::make("post_op_lists", {
+ post_ops_1(),
+ post_ops_2(),
+ post_ops_3(),
+ } );
+
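+// Helper that checks whether a post op list is accepted by ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate() for a fixed GEMM configuration, so that only the post ops vary between test cases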
+bool is_post_op_list_valid(unsigned int m, unsigned int n, unsigned int k, unsigned int batch, DataType data_type, const experimental::PostOpList<ITensorInfo*> &post_ops)
+{
+ const auto lhs_info = GEMMLHSMatrixInfo(4,4,1,false,true);
+ const auto rhs_info = GEMMRHSMatrixInfo(4,4,1,true,true,false);
+
+ // Create TensorInfo for post op arguments
+ TensorInfo input0_info(TensorShape(k, m, batch), 1, data_type);
+ TensorInfo input1_info(TensorShape(n, k, batch), 1, data_type);
+ TensorInfo input2_info(TensorShape(n), 1, data_type);
+ TensorInfo output_info(TensorShape(n, m, batch), 1, data_type);
+
+ const TensorInfo reshaped_input1_info = input1_info.clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(input1_info, rhs_info));
+
+ GEMMKernelInfo gemm_info(m, n, k, 0 /**< Depth of the output tensor in case it is reinterpreted as 3D */,
+ false /**< reinterpret the input as 3D */,
+ true /**< Flag used to broadcast the bias addition */,
+ false /**< wider accumulation */,
+ false /**< has pad y */,
+ ActivationLayerInfo::ActivationFunction::IDENTITY,
+ 1 /**< Multiplication factor for the width of the 1xW transposed block */,
+ 1 /**< Multiplication factor for the height of the 4x4 interleaved block */,
+ lhs_info,
+ rhs_info,
+ 0 /**< Offset to be added to each element of the matrix A */,
+ 0 /**< Offset to be added to each element of the matrix B */,
+ post_ops);
+ return bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(&input0_info.clone()->set_is_resizable(true),
+ &reshaped_input1_info.clone()->set_is_resizable(true),
+ &input2_info.clone()->set_is_resizable(true),
+ &output_info.clone()->set_is_resizable(true),1.f,1.f,
+ lhs_info,
+ rhs_info,
+ gemm_info));
+}
/** Configuration test */
bool validate_configuration(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value,
unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, unsigned int h0_value,
@@ -211,6 +292,7 @@ bool validate_configuration(unsigned int m_value, unsigned int n_value, unsigned
CLGEMMMatrixMultiplyReshapedOnlyRHS gemm;
return bool(gemm.validate(&lhs, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info));
}
+
} // namespace
TEST_SUITE(CL)
@@ -262,6 +344,119 @@ b_value, m0_value, n0_value, k0_value, broadcast_bias, input_as_3d, depth_output
ARM_COMPUTE_EXPECT(status == expected_value, framework::LogLevel::ERRORS);
}
+TEST_SUITE(ValidateFusedPostOpsConfigs)
+TEST_SUITE(Invalid)
+TEST_CASE(UnsupportedPostOpSequence, framework::DatasetMode::ALL)
+{
+ const auto data_type = DataType::F32;
+ const unsigned int m = 17;
+ const unsigned int n = 1;
+ const unsigned int k = 13;
+ const unsigned int batch = 2;
+ TensorShape post_op_arg0_shape(n, m, batch);
+ TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type);
+ auto post_op_arg1_info = post_op_arg_info.clone();
+
+ // Unsupported sequence of post ops: two consecutive element-wise additions are expected to be rejected
+ experimental::PostOpList<ITensorInfo*> post_ops{};
+ post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>(
+ &post_op_arg_info,
+ 1,
+ ConvertPolicy::SATURATE);
+ post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>(
+ post_op_arg1_info.get(),
+ 0,
+ ConvertPolicy::SATURATE);
+
+ ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS);
+}
+TEST_CASE(OutputWidened, framework::DatasetMode::ALL)
+{
+ // Invalid broadcast: post op tensors "widen" the output tensor
+ const auto data_type = DataType::F32;
+ const unsigned int m = 17;
+ const unsigned int n = 1;
+ const unsigned int k = 1;
+ const unsigned int batch = 1;
+ TensorShape post_op_arg_shape(n, m, batch + 4); // output's batch dimension is "widened", which is not allowed
+ TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
+ experimental::PostOpList<ITensorInfo*> post_ops{};
+ post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
+
+ ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS);
+}
+TEST_CASE(BroadcastInXDimOnly, framework::DatasetMode::ALL)
+{
+ // Invalid broadcast: post op tensors broadcast in the first dimension (X) only
+ const auto data_type = DataType::F32;
+ const unsigned int m = 22;
+ const unsigned int n = 16;
+ const unsigned int k = 15;
+ const unsigned int batch = 3;
+ TensorShape post_op_arg_shape(1, m, batch);
+ TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
+ experimental::PostOpList<ITensorInfo*> post_ops{};
+ post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
+
+ ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // Invalid
+TEST_SUITE(Valid)
+TEST_CASE(EmptyPostOpList, framework::DatasetMode::ALL)
+{
+ const auto data_type = DataType::F32;
+ const unsigned int m = 22;
+ const unsigned int n = 16;
+ const unsigned int k = 15;
+ const unsigned int batch = 3;
+ experimental::PostOpList<ITensorInfo*> post_ops{};
+
+ ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
+}
+TEST_CASE(BroadcastInYDimOnly, framework::DatasetMode::ALL)
+{
+ const auto data_type = DataType::F32;
+ const unsigned int m = 22;
+ const unsigned int n = 16;
+ const unsigned int k = 15;
+ const unsigned int batch = 3;
+ TensorShape post_op_arg_shape(n, 1, batch);
+ TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
+ experimental::PostOpList<ITensorInfo*> post_ops{};
+ post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
+
+ ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
+}
+TEST_CASE(BroadcastInBothXandYDims, framework::DatasetMode::ALL)
+{
+ const auto data_type = DataType::F32;
+ const unsigned int m = 22;
+ const unsigned int n = 16;
+ const unsigned int k = 15;
+ const unsigned int batch = 3;
+ TensorShape post_op_arg_shape(1, 1, batch);
+ TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
+ experimental::PostOpList<ITensorInfo*> post_ops{};
+ post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
+
+ ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
+}
+TEST_CASE(BroadcastInAllDims, framework::DatasetMode::ALL)
+{
+ const auto data_type = DataType::F32;
+ const unsigned int m = 22;
+ const unsigned int n = 16;
+ const unsigned int k = 15;
+ const unsigned int batch = 3;
+ TensorShape post_op_arg_shape(1, 1, 1);
+ TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type);
+ experimental::PostOpList<ITensorInfo*> post_ops{};
+ post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo*>>( &post_op_arg_info, 0, ConvertPolicy::SATURATE);
+
+ ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // Valid
+TEST_SUITE_END() // ValidateFusedPostOps
TEST_SUITE(Float)
TEST_SUITE(FP32)
@@ -462,6 +657,44 @@ FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixtur
framework::ARM_COMPUTE_PRINT_INFO();
}
}
+
+TEST_SUITE(FusedPostOps)
+
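+// Validate the reshaped-only-RHS GEMM kernel with each predefined fused post op list against the reference implementation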
+FIXTURE_DATA_TEST_CASE(RunPrecommit, CLGEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsFixture<float>, framework::DatasetMode::ALL,
+ combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+ m_values,
+ n_values),
+ k_values),
+ b_values),
+ m0_values_precommit),
+ n0_values_precommit),
+ k0_values_precommit),
+ framework::dataset::make("H0", {1})),
+ framework::dataset::make("interleave_rhs", { true })),
+ t_values_rhs),
+ framework::dataset::make("export_to_cl_image_rhs", false, true)),
+ framework::dataset::make("DataType", DataType::F32)),
+ a_values),
+ beta_values),
+ framework::dataset::make("broadcast_bias", { false } )),
+ act_values),
+ post_op_lists)
+ )
+{
+ // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension
+ if(validate_result)
+ {
+ validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+ }
+ else
+ {
+ ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+ framework::ARM_COMPUTE_PRINT_INFO();
+ }
+}
+
+TEST_SUITE_END() // FusedPostOps
+
TEST_SUITE_END() // FP32
TEST_SUITE(FP16)
@@ -590,6 +823,43 @@ FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixtur
framework::ARM_COMPUTE_PRINT_INFO();
}
}
+TEST_SUITE(FusedPostOps)
+
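+// Same as the FP32 fused post op test, but with FP16 data and the RHS exported to cl_image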
+FIXTURE_DATA_TEST_CASE(RunPrecommit, CLGEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsFixture<half>, framework::DatasetMode::ALL,
+ combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+ m_values,
+ n_values),
+ k_values),
+ b_values),
+ m0_values_precommit),
+ n0_values_precommit),
+ k0_values_precommit),
+ framework::dataset::make("H0", {1})),
+ framework::dataset::make("interleave_rhs", { true })),
+ t_values_rhs),
+ framework::dataset::make("export_to_cl_image_rhs", true)),
+ framework::dataset::make("DataType", DataType::F16)),
+ a_values),
+ beta_values),
+ framework::dataset::make("broadcast_bias", { false } )),
+ act_values),
+ post_op_lists)
+ )
+{
+ // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension
+ if(validate_result)
+ {
+ validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+ }
+ else
+ {
+ ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+ framework::ARM_COMPUTE_PRINT_INFO();
+ }
+}
+
+TEST_SUITE_END() // FusedPostOps
+
TEST_SUITE_END() // FP16
TEST_SUITE_END() // Float
diff --git a/tests/validation/fixtures/GEMMFixture.h b/tests/validation/fixtures/GEMMFixture.h
index e1191587d5..fa273018a4 100644
--- a/tests/validation/fixtures/GEMMFixture.h
+++ b/tests/validation/fixtures/GEMMFixture.h
@@ -1522,6 +1522,243 @@ protected:
SimpleTensor<T> _reference{};
};
+/** Fixture for the reshaped-only-RHS GEMM kernel with fused post ops (EXPERIMENTAL_POST_OPS) */
+template <typename TensorType, typename AccessorType, typename T, typename ReshapeRHSOperatorType, typename GEMMOperatorType>
+class GEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsValidationFixture : public framework::Fixture
+{
+public:
+ using PostOpArgBroadcast = std::tuple<bool, bool, bool>; // Tells the fixture whether each PostOp argument broadcasts in dimensions 0, 1 and 2
+ template <typename...>
+ void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int h0,
+ bool interleave_rhs, bool transpose_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info,
+ const experimental::PostOpList<PostOpArgBroadcast> &post_ops)
+ {
+ GEMMLHSMatrixInfo lhs_info;
+ lhs_info.m0 = m0;
+ lhs_info.k0 = k0;
+
+ GEMMRHSMatrixInfo rhs_info;
+ rhs_info.n0 = n0;
+ rhs_info.k0 = k0;
+ rhs_info.h0 = h0;
+ rhs_info.interleave = interleave_rhs;
+ rhs_info.transpose = transpose_rhs;
+ rhs_info.export_to_cl_image = export_to_cl_image;
+
+ // Set the tensor shapes for LHS and RHS matrices
+ const TensorShape lhs_shape(k, m, batch_size);
+ const TensorShape rhs_shape(n, k, batch_size);
+ const TensorShape bias_shape(n,
+ broadcast_bias ? 1 : m,
+ broadcast_bias ? 1 : batch_size);
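+ // Translate each post op argument's broadcast flags into a concrete tensor shape: broadcast dimensions collapse to 1, the others follow the output shape (n, m, batch_size)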
+ auto post_ops_with_shapes = experimental::transform_post_op_list_arguments<PostOpArgBroadcast, TensorShape>(post_ops,
+ [ = ](auto broadcast)
+ {
+ return TensorShape
+ {
+ std::get<0>(broadcast) ? 1 : n,
+ std::get<1>(broadcast) ? 1 : m,
+ std::get<2>(broadcast) ? 1 : batch_size,
+ };
+ });
+
+ _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes);
+ if(validate_result)
+ {
+ _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes);
+ }
+ }
+
+protected:
+ template <typename U>
+ void fill(U &&tensor, int i)
+ {
+ static_assert(std::is_floating_point<T>::value || std::is_same<T, half>::value, "Only floating point data types supported.");
+ using DistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type;
+
+ DistributionType distribution{ T(-1.0f), T(1.0f) };
+ library->fill(tensor, distribution, i);
+
+ // Fill border with infinity in order to check the presence of NaN values (i.e. inf * 0)
+ DistributionType distribution_inf{ T(std::numeric_limits<float>::infinity()), T(std::numeric_limits<float>::infinity()) };
+ library->fill_borders_with_garbage(tensor, distribution_inf, i);
+ }
+
+ TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+ DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info, const experimental::PostOpList<TensorShape> &post_ops)
+ {
+ // Create tensors
+ TensorType lhs = create_tensor<TensorType>(lhs_shape, data_type, 1);
+ TensorType rhs = create_tensor<TensorType>(rhs_shape, data_type, 1);
+ TensorType bias = create_tensor<TensorType>(bias_shape, data_type, 1);
+ TensorType rhs_reshaped;
+ TensorType dst;
+ // Create one backing tensor per post op argument and substitute its ITensorInfo into the post op list
+ std::vector<TensorType> post_op_tensors_holder{};
+ auto populated_post_ops = experimental::transform_post_op_list_arguments<TensorShape, ITensorInfo *>(post_ops,
+ [&post_op_tensors_holder, &data_type](auto shape)
+ {
+ auto t = create_tensor<TensorType>(shape, data_type, 1);
+ post_op_tensors_holder.push_back(std::move(t));
+ return post_op_tensors_holder.back().info();
+ });
+
+ const unsigned int M = lhs_shape[1];
+ const unsigned int N = rhs_shape[0];
+ const unsigned int K = lhs_shape[0];
+ GEMMKernelInfo kernel_info;
+ kernel_info.m = M;
+ kernel_info.n = N;
+ kernel_info.k = K;
+ kernel_info.depth_output_gemm3d = 0;
+ kernel_info.reinterpret_input_as_3d = false;
+ kernel_info.broadcast_bias = broadcast_bias;
+ kernel_info.activation_info = act_info;
+ kernel_info.post_ops = populated_post_ops;
+
+ // The output tensor will be auto-initialized within the function
+
+ // Create and configure function
+ ReshapeRHSOperatorType reshape_rhs;
+ GEMMOperatorType gemm;
+
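+ // Skip the run (and the output validation) only when cl_image export is requested but the RHS reshape is not supported on this device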
+ validate_result = bool(reshape_rhs.validate(rhs.info(), rhs_reshaped.info(), rhs_info));
+ validate_result = validate_result || !rhs_info.export_to_cl_image;
+ if(!validate_result)
+ {
+ return nullptr;
+ }
+
+ reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
+ gemm.configure(lhs.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info);
+
+ ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
+ ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
+ ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
+ for(const auto &tensor : post_op_tensors_holder)
+ {
+ ARM_COMPUTE_ASSERT(tensor.info()->is_resizable());
+ }
+
+ // We do not pad when exporting to cl_image, as the tensor needs to comply with strict pitch alignment restrictions
+ if(!rhs_info.export_to_cl_image)
+ {
+ add_padding_x({ &lhs, &rhs, &rhs_reshaped, &bias, &dst });
+ for(auto &tensor : post_op_tensors_holder)
+ {
+ add_padding_x({ &tensor });
+ }
+ }
+
+ // Allocate tensors
+ lhs.allocator()->allocate();
+ rhs.allocator()->allocate();
+ rhs_reshaped.allocator()->allocate();
+ bias.allocator()->allocate();
+ dst.allocator()->allocate();
+ for(auto &tensor : post_op_tensors_holder)
+ {
+ tensor.allocator()->allocate();
+ }
+
+ ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
+ ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
+ ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable());
+ ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+ ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+ for(const auto &tensor : post_op_tensors_holder)
+ {
+ ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable());
+ }
+
+ // Fill tensors
+ fill(AccessorType(lhs), 0);
+ fill(AccessorType(rhs), 1);
+ fill(AccessorType(bias), 2);
+ for(size_t i = 0; i < post_op_tensors_holder.size(); ++i)
+ {
+ fill(AccessorType(post_op_tensors_holder.at(i)), 3 + i);
+ }
+
+ // Compute GEMM
+ ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
+ reshape_rhs.run(reshape_rhs_pack);
+ ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
+ { ACL_SRC_1, &rhs_reshaped },
+ { ACL_SRC_2, &bias },
+ { ACL_DST, &dst }
+ });
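+ // Bind each post op argument tensor to its dedicated slot in the pack (see experimental::get_post_op_arg_type)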
+ for(size_t i = 0; i < post_op_tensors_holder.size(); ++i)
+ {
+ gemm_pack.add_tensor(experimental::get_post_op_arg_type(i), &post_op_tensors_holder.at(i));
+ }
+ gemm.run(gemm_pack);
+
+ return dst;
+ }
+
+ SimpleTensor<T> compute_reference(const TensorShape &lhs_shape, const TensorShape &rhs_shape, DataType data_type, float alpha, float beta, bool broadcast_bias,
+ const ActivationLayerInfo &act_info, const experimental::PostOpList<TensorShape> &post_ops)
+ {
+ TensorShape dst_shape = lhs_shape;
+ dst_shape[0] = rhs_shape[0];
+ dst_shape[1] = lhs_shape[1];
+
+ // Create reference
+ SimpleTensor<T> lhs{ lhs_shape, data_type, 1 };
+ SimpleTensor<T> rhs{ rhs_shape, data_type, 1 };
+ SimpleTensor<T> bias{ dst_shape, data_type, 1 };
+ // Create one reference tensor per post op argument and substitute it into the post op list
+ auto populated_post_ops = experimental::transform_post_op_list_arguments<TensorShape, SimpleTensor<T>>(post_ops, [&data_type](auto shape)
+ {
+ return SimpleTensor<T> { shape, data_type, 1 };
+ });
+
+ const int n = rhs_shape[0];
+ const int m = lhs_shape[1];
+ const int batch_size = lhs_shape[2];
+
+ // Fill reference
+ int tensor_idx = 0;
+ fill(lhs, tensor_idx++);
+ fill(rhs, tensor_idx++);
+ fill(bias, tensor_idx++);
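+ // Fill each post op argument with a distinct seed, matching the order used for the target tensors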
+ for(auto &op : populated_post_ops.get_list())
+ {
+ for(auto tensor : op->arguments())
+ {
+ fill(*tensor, tensor_idx++);
+ }
+ }
+
+ if(broadcast_bias)
+ {
+ // In case of broadcast, simply copy the first row into all the following rows
+ for(int i = 1; i < m * batch_size; i++)
+ {
+ memcpy(bias.data() + i * n, bias.data(), n * sizeof(T));
+ }
+ }
+
+ SimpleTensor<T> out;
+ out = reference::gemm<T>(lhs, rhs, bias, alpha, beta);
+ // Ignore activation info if post ops are used instead
+ if(populated_post_ops.size() > 0)
+ {
+ out = reference::post_ops<T>(out, populated_post_ops);
+ }
+ else
+ {
+ out = reference::activation_layer(out, act_info);
+ }
+ return out;
+ }
+
+ bool validate_result = true;
+ TensorType _target{};
+ SimpleTensor<T> _reference{};
+};
+
template <typename TensorType, typename AccessorType, typename T, typename ReshapeRHSOperatorType, typename GEMMOperatorType>
class GEMMMatrixMultiplyReshapedOnlyRHS3DValidationFixture : public framework::Fixture
{
@@ -1830,6 +2067,213 @@ protected:
};
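+/** Fixture for the native GEMM kernel with fused post ops (EXPERIMENTAL_POST_OPS) */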
template <typename TensorType, typename AccessorType, typename T, typename GEMMOperatorType>
+class GEMMMatrixMultiplyNativeWithPostOpsValidationFixture : public framework::Fixture
+{
+public:
+ using PostOpArgBroadcast = std::tuple<bool, bool, bool>; // Tells the fixture whether each PostOp argument broadcasts in dimensions 0, 1 and 2
+ template <typename...>
+ void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, DataType data_type, float alpha, float beta, bool broadcast_bias,
+ const ActivationLayerInfo &act_info, const experimental::PostOpList<PostOpArgBroadcast> &post_ops)
+ {
+ GEMMLHSMatrixInfo lhs_info;
+ lhs_info.m0 = m0;
+ lhs_info.k0 = k0;
+
+ GEMMRHSMatrixInfo rhs_info;
+ rhs_info.n0 = n0;
+ rhs_info.k0 = k0;
+
+ // Set the tensor shapes for LHS and RHS matrices
+ const TensorShape lhs_shape(k, m, batch_size);
+ const TensorShape rhs_shape(n, k, batch_size);
+ const TensorShape bias_shape(n,
+ broadcast_bias ? 1 : m,
+ broadcast_bias ? 1 : batch_size);
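+ // Broadcast dimensions collapse to 1; the others follow the output shape (n, m, batch_size)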
+ const auto post_ops_with_shapes = experimental::transform_post_op_list_arguments<PostOpArgBroadcast, TensorShape>(post_ops,
+ [ = ](auto broadcast)
+ {
+ return TensorShape
+ {
+ std::get<0>(broadcast) ? 1 : n,
+ std::get<1>(broadcast) ? 1 : m,
+ std::get<2>(broadcast) ? 1 : batch_size,
+ };
+ });
+
+ _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes);
+ _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes);
+ }
+
+protected:
+ template <typename U>
+ void fill(U &&tensor, int i)
+ {
+ static_assert(std::is_floating_point<T>::value || std::is_same<T, half>::value, "Only floating point data types supported.");
+ using DistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type;
+
+ DistributionType distribution{ T(-1.0f), T(1.0f) };
+ library->fill(tensor, distribution, i);
+
+ // Fill border with infinity in order to check the presence of NaN values (i.e. inf * 0)
+ DistributionType distribution_inf{ T(std::numeric_limits<float>::infinity()), T(std::numeric_limits<float>::infinity()) };
+ library->fill_borders_with_garbage(tensor, distribution_inf, i);
+ }
+
+ TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
+ DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info, const experimental::PostOpList<TensorShape> &post_ops)
+ {
+ // Create tensors
+ TensorType lhs = create_tensor<TensorType>(lhs_shape, data_type, 1);
+ TensorType rhs = create_tensor<TensorType>(rhs_shape, data_type, 1);
+ TensorType bias = create_tensor<TensorType>(bias_shape, data_type, 1);
+ TensorType dst;
+ // Create one backing tensor per post op argument and substitute its ITensorInfo into the post op list
+ std::vector<TensorType> post_op_tensors_holder{};
+ auto populated_post_ops = experimental::transform_post_op_list_arguments<TensorShape, ITensorInfo *>(post_ops,
+ [&post_op_tensors_holder, &data_type](auto shape)
+ {
+ auto t = create_tensor<TensorType>(shape, data_type, 1);
+ post_op_tensors_holder.push_back(std::move(t));
+ return post_op_tensors_holder.back().info();
+ });
+
+ const unsigned int M = lhs_shape[1];
+ const unsigned int N = rhs_shape[0];
+ const unsigned int K = lhs_shape[0];
+ GEMMKernelInfo kernel_info;
+ kernel_info.m = M;
+ kernel_info.n = N;
+ kernel_info.k = K;
+ kernel_info.depth_output_gemm3d = 0;
+ kernel_info.reinterpret_input_as_3d = false;
+ kernel_info.broadcast_bias = broadcast_bias;
+ kernel_info.activation_info = act_info;
+ kernel_info.post_ops = populated_post_ops;
+
+ // Create and configure function
+ GEMMOperatorType gemm;
+ gemm.configure(lhs.info(), rhs.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info);
+
+ ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
+ ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
+ ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
+ for(const auto &tensor : post_op_tensors_holder)
+ {
+ ARM_COMPUTE_ASSERT(tensor.info()->is_resizable());
+ }
+
+ add_padding_x({ &lhs, &rhs, &bias, &dst });
+ for(auto &tensor : post_op_tensors_holder)
+ {
+ add_padding_x({ &tensor });
+ }
+
+ // Allocate tensors
+ lhs.allocator()->allocate();
+ rhs.allocator()->allocate();
+ bias.allocator()->allocate();
+ dst.allocator()->allocate();
+ for(auto &tensor : post_op_tensors_holder)
+ {
+ tensor.allocator()->allocate();
+ }
+
+ ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
+ ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
+ ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+ ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+ for(const auto &tensor : post_op_tensors_holder)
+ {
+ ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable());
+ }
+
+ // Fill tensors
+ fill(AccessorType(lhs), 0);
+ fill(AccessorType(rhs), 1);
+ fill(AccessorType(bias), 2);
+ for(size_t i = 0; i < post_op_tensors_holder.size(); ++i)
+ {
+ fill(AccessorType(post_op_tensors_holder.at(i)), 3 + i);
+ }
+
+ // Compute GEMM
+ ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
+ { ACL_SRC_1, &rhs },
+ { ACL_SRC_2, &bias },
+ { ACL_DST, &dst }
+ });
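+ // Bind each post op argument tensor to its dedicated slot in the pack (see experimental::get_post_op_arg_type)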
+ for(size_t i = 0; i < post_op_tensors_holder.size(); ++i)
+ {
+ gemm_pack.add_tensor(experimental::get_post_op_arg_type(i), &post_op_tensors_holder.at(i));
+ }
+ gemm.run(gemm_pack);
+
+ return dst;
+ }
+
+ SimpleTensor<T> compute_reference(const TensorShape &lhs_shape, const TensorShape &rhs_shape, DataType data_type, float alpha, float beta, bool broadcast_bias,
+ const ActivationLayerInfo &act_info, const experimental::PostOpList<TensorShape> &post_ops)
+ {
+ TensorShape dst_shape = lhs_shape;
+ dst_shape[0] = rhs_shape[0];
+ dst_shape[1] = lhs_shape[1];
+
+ // Create reference
+ SimpleTensor<T> lhs{ lhs_shape, data_type, 1 };
+ SimpleTensor<T> rhs{ rhs_shape, data_type, 1 };
+ SimpleTensor<T> bias{ dst_shape, data_type, 1 };
+ // Create one reference tensor per post op argument and substitute it into the post op list
+ auto populated_post_ops = experimental::transform_post_op_list_arguments<TensorShape, SimpleTensor<T>>(post_ops, [&data_type](auto shape)
+ {
+ return SimpleTensor<T> { shape, data_type, 1 };
+ });
+
+ const int n = rhs_shape[0];
+ const int m = lhs_shape[1];
+ const int batch_size = lhs_shape[2];
+
+ // Fill reference
+ int tensor_idx = 0;
+ fill(lhs, tensor_idx++);
+ fill(rhs, tensor_idx++);
+ fill(bias, tensor_idx++);
+ for(auto &op : populated_post_ops.get_list())
+ {
+ for(auto tensor : op->arguments())
+ {
+ fill(*tensor, tensor_idx++);
+ }
+ }
+
+ if(broadcast_bias)
+ {
+ // In case of broadcast, simply copy the first row into all the following rows
+ for(int i = 1; i < m * batch_size; i++)
+ {
+ memcpy(bias.data() + i * n, bias.data(), n * sizeof(T));
+ }
+ }
+
+ SimpleTensor<T> out;
+ out = reference::gemm<T>(lhs, rhs, bias, alpha, beta);
+ // Ignore activation info if post ops are used instead
+ if(populated_post_ops.size() > 0)
+ {
+ out = reference::post_ops<T>(out, populated_post_ops);
+ }
+ else
+ {
+ out = reference::activation_layer(out, act_info);
+ }
+ return out;
+ }
+
+ TensorType _target{};
+ SimpleTensor<T> _reference{};
+};
+
+template <typename TensorType, typename AccessorType, typename T, typename GEMMOperatorType>
class GEMMMatrixMultiplyNative3DValidationFixture : public framework::Fixture
{
public: