Optimize CLReduce for Min/Max Axis=0

Resolves: COMPMID-6400 Change-Id: Id9935f9727f77a824afc75c35f044e3f5c173e0d Signed-off-by: Gunes Bayir <gunes.bayir@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10120 Benchmark: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
author: Gunes Bayir <gunes.bayir@arm.com> 2023-07-18 15:57:23 +0100
committer: Gunes Bayir <gunes.bayir@arm.com> 2023-08-14 09:57:15 +0000
commit: 338ef4699735db16d346e52f05b822f8fd5e3263 (patch)
tree: 91ffe06a4a3960ee577673a47be0a99629012679
parent: 633ebd18d982496a6a626df3b8336f610ab09eb4 (diff)
download: ComputeLibrary-338ef4699735db16d346e52f05b822f8fd5e3263.tar.gz
4 files changed, 21 insertions, 3 deletions
diff --git a/src/core/CL/cl_kernels/common/reduction_operation.cl b/src/core/CL/cl_kernels/common/reduction_operation.cl
index 9f2c6e23b5..1cb6664078 100644
--- a/src/core/CL/cl_kernels/common/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/common/reduction_operation.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,6 +47,8 @@
 #define sum(in0, in1, size) (in0 + SUM_REDUCE(in1, size))
 #define square_sum(in0, in1, size) (in0 + SUM_REDUCE((in1 * in1), size))
 #define product(in0, in1, size) (in0 * PROD_REDUCE(in1, size))
+#define min_(in0, in1, size) (min(in0, MIN_REDUCE(in1, size)))
+#define max_(in0, in1, size) (max(in0, MAX_REDUCE(in1, size)))
 
 /** This kernel performs parallel reduction given an operation on x-axis.
  *
@@ -79,12 +81,15 @@ __kernel void reduction_operation_x(
     __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + y * input_stride_y + z * input_stride_z;
     __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + y * output_stride_y + z * output_stride_z;
 
+#if !defined(MIN) && !defined(MAX)
 #if defined(PROD)
     DATA_TYPE res = (DATA_TYPE)1;
 #else  // defined(PROD)
     DATA_TYPE res = (DATA_TYPE)0;
 #endif // defined(PROD)
-
+#else  // #if !defined(MIN) && !defined(MAX)
+    DATA_TYPE res = *((__global DATA_TYPE *)input_addr);
+#endif // #if defined(MIN) || defined(MAX)
     int x = 0;
 
     for(; x <= (WIDTH - VEC_SIZE); x += VEC_SIZE)
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index e0fd8dc3e3..b2ceaf92f3 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -824,6 +824,16 @@
 #define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
 #define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
 
+#define min_reduce_1(x) (x)
+#define min_reduce_2(x) min(((x).s0), ((x).s1))
+#define min_reduce_3(x) min(min_reduce_2((x).s01), ((x).s2))
+#define min_reduce_4(x) min(min_reduce_2((x).s01), min_reduce_2((x).s23))
+#define min_reduce_8(x) min(min_reduce_4((x).s0123), min_reduce_4((x).s4567))
+#define min_reduce_16(x) min(min_reduce_8((x).s01234567), min_reduce_8((x).s89ABCDEF))
+
+#define MIN_REDUCE_STR(x, size) min_reduce_##size(x)
+#define MIN_REDUCE(x, size) MIN_REDUCE_STR(x, size)
+
 #define VECTOR_DECLARATION(name)     \
     __global uchar *name##_ptr,      \
     uint        name##_stride_x, \
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index ee60b8e1df..e5cfb997ca 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -143,7 +143,10 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
             build_opts.add_option(("-DOPERATION=sum"));
             break;
         case ReductionOperation::MIN:
+            build_opts.add_option(("-DOPERATION=min_"));
+            break;
         case ReductionOperation::MAX:
+            build_opts.add_option(("-DOPERATION=max_"));
             break;
         case ReductionOperation::PROD:
             build_opts.add_option(("-DOPERATION=product"));
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index 0701ee7c90..1ca7adb3a8 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -390,7 +390,7 @@ bool needs_serialized_reduction(ReductionOperation op, DataType dt, unsigned int
     const bool is_quantized_type = is_data_type_quantized(dt);
     const bool is_first_dim      = (axis == 0);
 
-    return !is_first_dim || is_min_max || is_quantized_type;
+    return !is_first_dim || (is_quantized_type && !is_min_max);
 }
 
 QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool is_log)
author	Gunes Bayir <gunes.bayir@arm.com>	2023-07-18 15:57:23 +0100
committer	Gunes Bayir <gunes.bayir@arm.com>	2023-08-14 09:57:15 +0000
commit	338ef4699735db16d346e52f05b822f8fd5e3263 (patch)
tree	91ffe06a4a3960ee577673a47be0a99629012679
parent	633ebd18d982496a6a626df3b8336f610ab09eb4 (diff)
download	ComputeLibrary-338ef4699735db16d346e52f05b822f8fd5e3263.tar.gz