/*
 * Provenance: commit metadata and diff header, reproduced as recorded.
 *
 *   From 3ecf9fefa6f6299a0736599f150d4791cc8345d9 Mon Sep 17 00:00:00 2001
 *   From: Giorgio Arena
 *   Date: Wed, 28 Apr 2021 16:11:51 +0100
 *   Subject: Remove OpenCL padding: CLReductionOperationKernel
 *
 *   Change the parallel implementation across the X, now every thread
 *   computes one row
 *   Add missing test for MEAN_SUM
 *   Make reduction on any axis != 0 work with num_channels > 1
 *
 *   Resolve COMPMID-3917
 *
 *   Signed-off-by: Giorgio Arena
 *   Change-Id: Ib0f99540104e3c253bcd1ea637833db533f5e76e
 *   Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5522
 *   Comments-Addressed: Arm Jenkins
 *   Reviewed-by: Manuel Bottini
 *   Reviewed-by: Gian Marco Iodice
 *   Tested-by: Arm Jenkins
 *   ---
 *   src/core/CL/cl_kernels/helpers.h | 10 ++++++++++
 *   1 file changed, 10 insertions(+)
 *   (limited to 'src/core/CL/cl_kernels/helpers.h')
 *
 *   diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
 *   index 2eae5ee1c9..6cd76373d2 100644
 *   --- a/src/core/CL/cl_kernels/helpers.h
 *   +++ b/src/core/CL/cl_kernels/helpers.h
 *   @@ -567,6 +567,16 @@
 *   -- cgit v1.2.1
 *
 * Review note: the prod_reduce_N bodies below differ from the recorded patch
 * only by an added pair of outer parentheses around each replacement list.
 * The SUM_REDUCE* and max_reduce_* lines are the patch's unchanged context.
 */

#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)

/** Product of all components of @p x, for a fixed component count N.
 *
 * prod_reduce_N(x) multiplies the N components of x that are reached through
 * the .s0, .s1, ... .sF accessors (wider sizes split x into halves such as
 * .s0123 / .s4567 and multiply the two partial products).
 *
 * Every replacement list is fully parenthesised so the expansion behaves as a
 * single operand. Without the outer parentheses, `a / prod_reduce_2(v)` would
 * expand to `a / v.s0 * v.s1` and silently compute the wrong value.
 *
 * @note @p x appears more than once in the expansion, so it must be free of
 *       side effects.
 * @note Presumably x is an OpenCL vector of matching width; any operand that
 *       exposes the same accessors works as far as the preprocessor goes.
 *
 * @param[in] x Operand whose components are multiplied together.
 */
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) (((x).s0) * ((x).s1))
#define prod_reduce_3(x) (prod_reduce_2((x).s01) * ((x).s2))
#define prod_reduce_4(x) (prod_reduce_2((x).s01) * prod_reduce_2((x).s23))
#define prod_reduce_8(x) (prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567))
#define prod_reduce_16(x) (prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF))

/** Size-dispatching front end for the prod_reduce_N family.
 *
 * PROD_REDUCE goes through PROD_REDUCE_STR so that a @p size given as a macro
 * is expanded to its value before being pasted onto the `prod_reduce_` prefix.
 *
 * @param[in] x    Operand forwarded to prod_reduce_<size>.
 * @param[in] size Component count; must name one of the prod_reduce_N macros
 *                 defined above (1, 2, 3, 4, 8 or 16).
 */
#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)

#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))