From b1fcefddf3f59219a9d7930d607175b7e6c39347 Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Wed, 15 Jun 2022 19:02:28 +0100 Subject: Implement new Elementwise Dynamic Fusion Operators: Div, Floor Resolves: COMPMID-5355 Change-Id: I92f73fbe885f28bbe7b07965b90cfd807c93602f Signed-off-by: Michalis Spyrou Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7745 Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: SiCong Li --- src/core/CL/cl_kernels/common/gemmlowp.cl | 4 +- src/core/CL/cl_kernels/nhwc/direct_convolution.cl | 4 +- .../CL/cl_kernels/nhwc/direct_convolution3d.cl | 4 +- src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl | 4 +- .../cl_kernels/nhwc/dwc_native_quantized_nhwc.cl | 4 +- .../cl_kernels/nhwc/winograd_output_transform.cl | 12 +++--- src/core/CL/cl_kernels/tile_helpers.h | 46 ++++++++++++++++++---- 7 files changed, 54 insertions(+), 24 deletions(-) (limited to 'src/core/CL') diff --git a/src/core/CL/cl_kernels/common/gemmlowp.cl b/src/core/CL/cl_kernels/common/gemmlowp.cl index 9fad66df68..53ce296948 100644 --- a/src/core/CL/cl_kernels/common/gemmlowp.cl +++ b/src/core/CL/cl_kernels/common/gemmlowp.cl @@ -703,7 +703,7 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t a_offset_s32[0].v *= A_OFFSET; - T_ADD_BROADCAST_X(int, M0, N0, offset_s32, a_offset_s32, offset_s32); + T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, a_offset_s32, offset_s32); #endif // defined(A_OFFSET) #if defined(B_OFFSET) @@ -728,7 +728,7 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t T_LOAD(int, 1, N0, BUFFER, biases, xo, 0, 1, 0, bias); - T_ADD_BROADCAST_X(int, M0, N0, offset_s32, bias, offset_s32); + T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, bias, offset_s32); #endif // defined(ADD_BIAS) LOOP_UNROLLING(int, i, 0, 1, M0, diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl index f1b422a68f..d34e24b436 100644 --- a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl +++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -233,7 +233,7 @@ __kernel void direct_convolution_nhwc( T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 1, 0, bias0); // c = c + bias[broadcasted] - T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, N0, c, bias0, c); + T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c); #endif // HAS_BIAS diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl index 587f3984ab..807b990e82 100644 --- a/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl +++ b/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -254,7 +254,7 @@ __kernel void direct_convolution3d_ndhwc( } // c = c + bias[broadcasted] - T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, N0, c, bias0, c); + T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c); #endif // HAS_BIAS diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl index 4f57a81e7b..b24a6ae85f 100644 --- a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl +++ b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -175,7 +175,7 @@ __kernel void dwc_native_fp_nhwc( T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, (cout * DEPTH_MULTIPLIER) + d, 0, 0, 0, bias0); // c = c + bias[broadcasted] - T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, N0, c, bias0, c); + T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c); #endif // HAS_BIAS T_ACTIVATION(ACC_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c); diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl index ec2593af71..263a23ef28 100644 --- a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl +++ b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -236,7 +236,7 @@ __kernel void dwc_native_quantized_nhwc( T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout * DEPTH_MULTIPLIER + d, 0, 0, 0, bias0); // c = c + bias[broadcasted] - T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, N0, c, bias0, c); + T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c); #endif // HAS_BIAS T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE); diff --git a/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl index bab2ee850c..0883cd99c8 100644 --- a/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl +++ b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl @@ -111,7 +111,7 @@ __kernel void winograd_output_transform_2x2_7x7_nhwc( T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); - T_ADD_BROADCAST_X(DATA_TYPE, 2, N0, out, b, out); + T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 2, N0, out, b, out); #endif // defined(HAS_BIAS) T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); @@ -177,7 +177,7 @@ __kernel void winograd_output_transform_2x2_7x7_nhwc( T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); - T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out); + T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out); #endif // defined(HAS_BIAS) T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); @@ -287,7 +287,7 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc( T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); // c = c + bias[broadcasted] - T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out); + T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out); #endif // HAS_BIAS int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W; @@ -374,7 +374,7 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc( T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); // c = c + bias[broadcasted] - T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out); + T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 16, N0, out, b, out); #endif // HAS_BIAS int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W; @@ -488,7 +488,7 @@ __kernel void winograd_output_transform_4x4_5x5_nhwc( T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); // c = c + bias[broadcasted] - T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out); + T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out); #endif // HAS_BIAS int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W; @@ -586,7 +586,7 @@ __kernel void winograd_output_transform_4x4_5x5_nhwc( T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); // c = c + bias[broadcasted] - T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out); + T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 16, N0, out, b, out); #endif // HAS_BIAS int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W; diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h index 69898e1bb4..0ce343e3ec 100644 --- a/src/core/CL/cl_kernels/tile_helpers.h +++ b/src/core/CL/cl_kernels/tile_helpers.h @@ -970,6 +970,9 @@ #define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) #define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) +#define T_ADD(A_VAL, B_VAL) ((A_VAL) + (B_VAL)) +#define T_DIV(A_VAL, B_VAL) ((A_VAL) / (B_VAL)) + /** Element-wise activation for quantized types * * @note Performs: activation(LHS) = DST @@ -1014,11 +1017,15 @@ }) \ }) -/** Element-wise addition with RHS broadcasted (RHS has the X dimension only) +#define T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(T_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) +#define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(T_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) + +/** Element-wise operation with RHS broadcasted (RHS has the X dimension only) * - * @note Performs: LHS + RHS[broadcasted] = DST + * @note Performs: LHS OP RHS[broadcasted] = DST * @note Both tiles must have same data type * + * @param[in] T_ELWISE_OP Elementwise operator to perform * @param[in] DST_DATA_TYPE DST data type * @param[in] M0 Number of LHS rows * @param[in] N0 Number of LHS columns @@ -1026,19 +1033,23 @@ * @param[in] rhs RHS tile * @param[out] dst DST tile */ -#define T_ADD_BROADCAST_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \ +#define T_ELTWISE_BROADCAST_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \ ({ \ LOOP_UNROLLING(int, _m0, 0, 1, M0, \ { \ - dst[_m0].v = CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)) + CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)); \ + dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \ }) \ }) -/** Element-wise addition between two tiles (LHS and RHS) +#define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(T_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) +#define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(T_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) + +/** Element-wise operation between two tiles (LHS and RHS) * - * @note Performs: LHS + RHS = DST + * @note Performs: LHS OP RHS = DST * @note Both tiles must have same data type * + * @param[in] T_ELWISE_OP Elementwise operator to perform * @param[in] DST_DATA_TYPE DST data type * @param[in] M0 Number of LHS rows * @param[in] N0 Number of LHS columns @@ -1046,11 +1057,30 @@ * @param[in] rhs RHS tile * @param[out] dst DST tile */ -#define T_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \ +#define T_ELTWISE(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \ + ({ \ + LOOP_UNROLLING(int, _m0, 0, 1, M0, \ + { \ + dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \ + }) \ + }) + +/** Floor operation on a tile + * + * @note Performs: floor(SRC) = DST + * @note Both tiles must have same data type + * + * @param[in] DST_DATA_TYPE DST data type + * @param[in] M0 Number of SRC rows + * @param[in] N0 Number of SRC columns + * @param[in] src LHS tile + * @param[out] dst DST tile + */ +#define T_FLOOR(DST_DATA_TYPE, M0, N0, src, dst) \ ({ \ LOOP_UNROLLING(int, _m0, 0, 1, M0, \ { \ - dst[_m0].v = CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)) + CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)); \ + dst[_m0].v = floor(CONVERT(src[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \ }) \ }) -- cgit v1.2.1