aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl')
-rw-r--r--src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl140
1 files changed, 84 insertions, 56 deletions
diff --git a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
index e96aba613b..7f4ad814fb 100644
--- a/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
+++ b/src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl
@@ -26,7 +26,7 @@
#include "repeat.h"
/** (EXPERIMENTAL_POST_OPS) gemm_mm_reshaped_only_rhs kernel */
-#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
+#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE)
#if defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
#define CONCAT(a, b) a##b
@@ -151,6 +151,7 @@
#error "N0 value not supported"
#endif // N0 conditions
+#if defined(GEMM_MM_RESHAPED_ONLY_RHS_T_POST_ACT_ELTWISE_OP_ACT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
* Post op 1: activation (optional)
* Post op 2: elementwise op
@@ -194,7 +195,10 @@ __kernel void gemm_mm_reshaped_only_rhs_t_post_act_eltwise_op_act(IMAGE_DECLARAT
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Block size
#define RHS_BLOCK_SIZE ((K0) * (N0))
@@ -409,8 +413,9 @@ __kernel void gemm_mm_reshaped_only_rhs_t_post_act_eltwise_op_act(IMAGE_DECLARAT
#undef RHS_OFFSET_X
#undef RHS_STEP_X
}
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_T_POST_ACT_ELTWISE_OP_ACT)
-#if defined(OPENCL_IMAGE_SUPPORT)
+#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object.
* Post op 1: activation (optional)
* Post op 2: elementwise op
@@ -430,6 +435,9 @@ __kernel void gemm_mm_reshaped_only_rhs_t_post_act_eltwise_op_act(IMAGE_DECLARAT
* @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
* @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
*/
__kernel void gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
__read_only image2d_t rhs_img,
@@ -454,12 +462,15 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act(IMAGE_
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
-#define LEFTOVER_K (K % K0)
+ const uint LEFTOVER_K = K % K0;
// Block size
#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
@@ -562,99 +573,99 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act(IMAGE_
x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
}
-#if LEFTOVER_K != 0
- // Note: We cannot read out-of-bound elements from the RHS matrix because
- // the RHS width is always multiple of K0. This is not be true for the LHS matrix
-
- union UNION_VEC_TYPE
+ if(LEFTOVER_K != 0)
{
- DATA_TYPE s[K0];
- VEC_DATA_TYPE(DATA_TYPE, K0)
- v;
- };
+ // Note: We cannot read out-of-bound elements from the RHS matrix because
+ // the RHS width is always multiple of K0. This is not be true for the LHS matrix
+
+ union UNION_VEC_TYPE
+ {
+ DATA_TYPE s[K0];
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ v;
+ };
- union UNION_VEC_TYPE a0 = {.v = 0 };
+ union UNION_VEC_TYPE a0 = {.v = 0 };
#if M0 > 1
- union UNION_VEC_TYPE a1 = {.v = 0 };
+ union UNION_VEC_TYPE a1 = {.v = 0 };
#endif // M0 > 1
#if M0 > 2
- union UNION_VEC_TYPE a2 = {.v = 0 };
+ union UNION_VEC_TYPE a2 = {.v = 0 };
#endif // M0 > 2
#if M0 > 3
- union UNION_VEC_TYPE a3 = {.v = 0 };
+ union UNION_VEC_TYPE a3 = {.v = 0 };
#endif // M0 > 3
#if M0 > 4
- union UNION_VEC_TYPE a4 = {.v = 0 };
+ union UNION_VEC_TYPE a4 = {.v = 0 };
#endif // M0 > 4
#if M0 > 5
- union UNION_VEC_TYPE a5 = {.v = 0 };
+ union UNION_VEC_TYPE a5 = {.v = 0 };
#endif // M0 > 5
#if M0 > 6
- union UNION_VEC_TYPE a6 = {.v = 0 };
+ union UNION_VEC_TYPE a6 = {.v = 0 };
#endif // M0 > 6
#if M0 > 7
- union UNION_VEC_TYPE a7 = {.v = 0 };
+ union UNION_VEC_TYPE a7 = {.v = 0 };
#endif // M0 > 7
- REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
+ REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
- // Load from RHS matrix
- LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
+ // Load from RHS matrix
+ LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
- // Load from LHS matrix
- for(int k = 0; k < LEFTOVER_K; ++k)
- {
- a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
+ // Load from LHS matrix
+ for(int k = 0; k < LEFTOVER_K; ++k)
+ {
+ a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
#if M0 > 1
- a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
+ a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
#endif // M0 > 1
#if M0 > 2
- a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
+ a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
#endif // M0 > 2
#if M0 > 3
- a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
+ a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
#endif // M0 > 3
#if M0 > 4
- a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
+ a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
#endif // M0 > 4
#if M0 > 5
- a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
+ a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
#endif // M0 > 5
#if M0 > 6
- a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
+ a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
#endif // M0 > 6
#if M0 > 7
- a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
+ a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
#endif // M0 > 7
- lhs_offset += sizeof(DATA_TYPE);
- }
+ lhs_offset += sizeof(DATA_TYPE);
+ }
- // Accumulate
- ARM_DOT_K0XN0(K0, a0.v, b, c0);
+ // Accumulate
+ ARM_DOT_K0XN0(K0, a0.v, b, c0);
#if M0 > 1
- ARM_DOT_K0XN0(K0, a1.v, b, c1);
+ ARM_DOT_K0XN0(K0, a1.v, b, c1);
#endif // M0 > 1
#if M0 > 2
- ARM_DOT_K0XN0(K0, a2.v, b, c2);
+ ARM_DOT_K0XN0(K0, a2.v, b, c2);
#endif // M0 > 2
#if M0 > 3
- ARM_DOT_K0XN0(K0, a3.v, b, c3);
+ ARM_DOT_K0XN0(K0, a3.v, b, c3);
#endif // M0 > 3
#if M0 > 4
- ARM_DOT_K0XN0(K0, a4.v, b, c4);
+ ARM_DOT_K0XN0(K0, a4.v, b, c4);
#endif // M0 > 4
#if M0 > 5
- ARM_DOT_K0XN0(K0, a5.v, b, c5);
+ ARM_DOT_K0XN0(K0, a5.v, b, c5);
#endif // M0 > 5
#if M0 > 6
- ARM_DOT_K0XN0(K0, a6.v, b, c6);
+ ARM_DOT_K0XN0(K0, a6.v, b, c6);
#endif // M0 > 6
#if M0 > 7
- ARM_DOT_K0XN0(K0, a7.v, b, c7);
+ ARM_DOT_K0XN0(K0, a7.v, b, c7);
#endif // M0 > 7
-
-#endif // LEFTOVER_K != 0
+ }
__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
@@ -723,10 +734,9 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act(IMAGE_
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
-#undef LEFTOVER_K
#undef PIXEL_UNIT
}
-#endif // defined(OPENCL_IMAGE_SUPPORT)
+#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
#define VFMA(a, b, c) \
({ \
@@ -805,6 +815,7 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act(IMAGE_
#error "M0 not supported"
#endif // M0 not supported
+#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_POST_ACT_ELTWISE_OP_ACT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops:
* Post op 1: activation (optional)
* Post op 2: elementwise op
@@ -824,6 +835,9 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act(IMAGE_
* @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
* @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
*/
__kernel void gemm_mm_reshaped_only_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
IMAGE_DECLARATION(rhs),
@@ -848,7 +862,10 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLARA
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Block size
#define RHS_BLOCK_SIZE ((K0) * (N0))
@@ -1087,9 +1104,11 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLARA
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
+#undef RHS_STEP_LOOP
}
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_POST_ACT_ELTWISE_OP_ACT)
-#if defined(OPENCL_IMAGE_SUPPORT)
+#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices plus 3 post ops. The RHS matrix is stored in OpenCL image object.
* Post op 1: activation (optional)
* Post op 2: elementwise op
@@ -1109,6 +1128,9 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_post_act_eltwise_op_act(IMAGE_DECLARA
* @param[in] eltwise_operand_stride_y Stride of the eltwise operand matrix in Y dimension (in bytes)
* @param[in] eltwise_operand_step_y eltwise_operand_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] eltwise_operand_stride_z Stride of the eltwise operand tensor in Z dimension (in bytes)
+ * @param[in] M Number of rows in LHS matrix not reshaped.
+ * @param[in] N Number of columns in RHS matrix not reshaped.
+ * @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.
*/
__kernel void gemm_mm_reshaped_only_rhs_nt_texture_post_act_eltwise_op_act(IMAGE_DECLARATION(lhs),
__read_only image2d_t rhs_img,
@@ -1133,7 +1155,10 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture_post_act_eltwise_op_act(IMAGE
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+ ,
+ const int M,
+ const int N,
+ const int K)
{
// Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
@@ -1145,9 +1170,11 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture_post_act_eltwise_op_act(IMAGE
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
+#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (PIXEL_UNIT)
+#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)
uint x = get_global_id(0);
@@ -1365,7 +1392,8 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture_post_act_eltwise_op_act(IMAGE
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
+#undef RHS_STEP_LOOP
}
-#endif // defined(OPENCL_IMAGE_SUPPORT)
+#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE_POST_ACT_ELTWISE_OP_ACT)
#endif // defined(P2_ELTWISE_OP) && defined(P2_ELTWISE_ARG1_HEIGHT) && defined(P2_ELTWISE_ARG1_WIDTH)
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K) \ No newline at end of file
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE)