From 17b0f8ba60ec9db4b96471f9406843bee6a43a4f Mon Sep 17 00:00:00 2001 From: Vidhya Sudhan Loganathan Date: Tue, 8 Jan 2019 12:17:03 +0000 Subject: COMPMID-1837 : Implement REPEAT utility macro on OpenCL Change-Id: I2b0dbfe7d430a8d0f62eb906f0334b16cde9e45b Reviewed-on: https://review.mlplatform.org/457 Reviewed-by: Gian Marco Iodice Tested-by: Arm Jenkins Reviewed-by: Georgios Pinitas --- src/core/CL/cl_kernels/gemm.cl | 157 +++------------------------------------- src/core/CL/cl_kernels/repeat.h | 83 +++++++++++++++++++++ 2 files changed, 93 insertions(+), 147 deletions(-) create mode 100644 src/core/CL/cl_kernels/repeat.h diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl index 7a861dd207..9dd072bd6e 100644 --- a/src/core/CL/cl_kernels/gemm.cl +++ b/src/core/CL/cl_kernels/gemm.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "helpers.h" +#include "repeat.h" #if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) @@ -99,14 +100,7 @@ __kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src), __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)); - uint zin0 = 0; - uint zin1 = 0; - uint zin2 = 0; - uint zin3 = 0; - uint zin4 = 0; - uint zin5 = 0; - uint zin6 = 0; - uint zin7 = 0; + REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0); //uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0; #if defined(REINTERPRET_INPUT_AS_3D) // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we @@ -164,7 +158,7 @@ __kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src), zin6 = min((uint)(DEPTH_GEMM3D - 1), zin6); zin6 *= (cross_plane_pad * src_stride_y); #endif // M0 > 6 -#if M0 > 6 +#if M0 > 7 zin7 = (7 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D; zin7 = min((uint)(DEPTH_GEMM3D - 1), zin7); zin7 *= (cross_plane_pad * src_stride_y); @@ -609,38 +603,7 @@ __kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src), // ---------------------------Load input values -------------------------------- - VEC_DATA_TYPE(DATA_TYPE, N0) - a0 = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - a1 = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - a2 = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - a3 = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - a4 = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - a5 = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - a6 = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - a7 = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - a8 = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - a9 = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - aA = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - aB = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - aC = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - aD = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - aE = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - aF = 0; + REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); ////uint a0=0, a1=0, a2=0...a(M0-1)=0; // Load values from the RHS matrix a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y)); @@ -714,7 +677,6 @@ __kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src), #endif // K0 > 8 // ---------------------------Store output values ------------------------------ - VSTORE(N0) (a0, 0, (__global DATA_TYPE *)(output_ptr + 0 * OUTPUT_STEP_X * sizeof(DATA_TYPE))); #if K0 > 1 @@ -828,39 +790,7 @@ __kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src), (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z; // ---------------------------Load input values -------------------------------- - - VEC_DATA_TYPE(DATA_TYPE, N0) - a0 = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - a1 = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - a2 = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - a3 = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - a4 = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - a5 = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - a6 = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - a7 = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - a8 = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - a9 = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - aA = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - aB = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - aC = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - aD = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - aE = 0; - VEC_DATA_TYPE(DATA_TYPE, N0) - aF = 0; + REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0; // Load values from the RHS matrix a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y)); @@ -930,39 +860,7 @@ __kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src), #endif // K0 > 8 // ---------------------------Transpose the block ------------------------------ - - VEC_DATA_TYPE(DATA_TYPE, K0) - res0 = 0; - VEC_DATA_TYPE(DATA_TYPE, K0) - res1 = 0; - VEC_DATA_TYPE(DATA_TYPE, K0) - res2 = 0; - VEC_DATA_TYPE(DATA_TYPE, K0) - res3 = 0; - VEC_DATA_TYPE(DATA_TYPE, K0) - res4 = 0; - VEC_DATA_TYPE(DATA_TYPE, K0) - res5 = 0; - VEC_DATA_TYPE(DATA_TYPE, K0) - res6 = 0; - VEC_DATA_TYPE(DATA_TYPE, K0) - res7 = 0; - VEC_DATA_TYPE(DATA_TYPE, K0) - res8 = 0; - VEC_DATA_TYPE(DATA_TYPE, K0) - res9 = 0; - VEC_DATA_TYPE(DATA_TYPE, K0) - resA = 0; - VEC_DATA_TYPE(DATA_TYPE, K0) - resB = 0; - VEC_DATA_TYPE(DATA_TYPE, K0) - resC = 0; - VEC_DATA_TYPE(DATA_TYPE, K0) - resD = 0; - VEC_DATA_TYPE(DATA_TYPE, K0) - resE = 0; - VEC_DATA_TYPE(DATA_TYPE, K0) - resF = 0; + REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, 0); //VEC_DATA_TYPE(DATA_TYPE, K0) res0=0, res1=0, res2=0,... res(N0-1)=0; #if K0 == 4 // This part computes the following transpositions: @@ -1301,36 +1199,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), #endif // defined(MATRIX_B_DEPTH) // Initialize the accumulators - VEC_DATA_TYPE(DATA_TYPE, N0) - c0 = 0; -#if M0 > 1 - VEC_DATA_TYPE(DATA_TYPE, N0) - c1 = 0; -#endif // M0 > 1 -#if M0 > 2 - VEC_DATA_TYPE(DATA_TYPE, N0) - c2 = 0; -#endif // M0 > 2 -#if M0 > 3 - VEC_DATA_TYPE(DATA_TYPE, N0) - c3 = 0; -#endif // M0 > 3 -#if M0 > 4 - VEC_DATA_TYPE(DATA_TYPE, N0) - c4 = 0; -#endif // M0 > 4 -#if M0 > 5 - VEC_DATA_TYPE(DATA_TYPE, N0) - c5 = 0; -#endif // M0 > 5 -#if M0 > 6 - VEC_DATA_TYPE(DATA_TYPE, N0) - c6 = 0; -#endif // M0 > 6 -#if M0 > 7 - VEC_DATA_TYPE(DATA_TYPE, N0) - c7 = 0; -#endif // M0 > 7 + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0; for(int i = 0; i < K; i += K0) { @@ -1442,14 +1311,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y); - uint zout0 = 0; - uint zout1 = 0; - uint zout2 = 0; - uint zout3 = 0; - uint zout4 = 0; - uint zout5 = 0; - uint zout6 = 0; - uint zout7 = 0; + REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; #if defined(REINTERPRET_OUTPUT_AS_3D) // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension @@ -1576,6 +1438,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), (c7, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_y + zout7)); #endif // M0 > 7 + #undef LHS_BLOCK_SIZE #undef LHS_OFFSET_X #undef LHS_STEP_X diff --git a/src/core/CL/cl_kernels/repeat.h b/src/core/CL/cl_kernels/repeat.h new file mode 100644 index 0000000000..691f7aea01 --- /dev/null +++ b/src/core/CL/cl_kernels/repeat.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_REPEAT_H +#define ARM_COMPUTE_REPEAT_H + +/** Macros that help in loop unrolling */ +//Repeat macros with 3 param, excluding the implicit ID param +#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C) +#define REPEAT_3_2(P_X, P_A, P_B, P_C) \ + P_X##_DEF(1, P_A, P_B, P_C); \ + REPEAT_3_1(P_X, P_A, P_B, P_C) +#define REPEAT_3_3(P_X, P_A, P_B, P_C) \ + P_X##_DEF(2, P_A, P_B, P_C); \ + REPEAT_3_2(P_X, P_A, P_B, P_C) +#define REPEAT_3_4(P_X, P_A, P_B, P_C) \ + P_X##_DEF(3, P_A, P_B, P_C); \ + REPEAT_3_3(P_X, P_A, P_B, P_C) +#define REPEAT_3_5(P_X, P_A, P_B, P_C) \ + P_X##_DEF(4, P_A, P_B, P_C); \ + REPEAT_3_4(P_X, P_A, P_B, P_C) +#define REPEAT_3_6(P_X, P_A, P_B, P_C) \ + P_X##_DEF(5, P_A, P_B, P_C); \ + REPEAT_3_5(P_X, P_A, P_B, P_C) +#define REPEAT_3_7(P_X, P_A, P_B, P_C) \ + P_X##_DEF(6, P_A, P_B, P_C); \ + REPEAT_3_6(P_X, P_A, P_B, P_C) +#define REPEAT_3_8(P_X, P_A, P_B, P_C) \ + P_X##_DEF(7, P_A, P_B, P_C); \ + REPEAT_3_7(P_X, P_A, P_B, P_C) +#define REPEAT_3_9(P_X, P_A, P_B, P_C) \ + P_X##_DEF(8, P_A, P_B, P_C); \ + REPEAT_3_8(P_X, P_A, P_B, P_C) +#define REPEAT_3_10(P_X, P_A, P_B, P_C) \ + P_X##_DEF(9, P_A, P_B, P_C); \ + REPEAT_3_9(P_X, P_A, P_B, P_C) +#define REPEAT_3_11(P_X, P_A, P_B, P_C) \ + P_X##_DEF(A, P_A, P_B, P_C); \ + REPEAT_3_10(P_X, P_A, P_B, P_C) +#define REPEAT_3_12(P_X, P_A, P_B, P_C) \ + P_X##_DEF(B, P_A, P_B, P_C); \ + REPEAT_3_11(P_X, P_A, P_B, P_C) +#define REPEAT_3_13(P_X, P_A, P_B, P_C) \ + P_X##_DEF(C, P_A, P_B, P_C); \ + REPEAT_3_12(P_X, P_A, P_B, P_C) +#define REPEAT_3_14(P_X, P_A, P_B, P_C) \ + P_X##_DEF(D, P_A, P_B, P_C); \ + REPEAT_3_13(P_X, P_A, P_B, P_C) +#define REPEAT_3_15(P_X, P_A, P_B, P_C) \ + P_X##_DEF(E, P_A, P_B, P_C); \ + REPEAT_3_14(P_X, P_A, P_B, P_C) +#define REPEAT_3_16(P_X, P_A, P_B, P_C) \ + P_X##_DEF(F, P_A, P_B, P_C); \ + REPEAT_3_15(P_X, P_A, P_B, P_C) + +#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM +#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) + +//Macro for initializing N variables. generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...) +#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL +#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL) + +#endif // ARM_COMPUTE_REPEAT_H -- cgit v1.2.1