From 8dfb8820d5fe0f72a923eccc3bb73ee0b87d5511 Mon Sep 17 00:00:00 2001
From: Pablo Marquez Tello <pablo.tello@arm.com>
Date: Thu, 13 Jul 2023 15:45:23 +0100
Subject: Enable S64 output in CLArgMinMax

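Resolves MLCE-1089

* arg_min_max.cl: convert the select() masks to types derived from
  DATA_TYPE_OUTPUT instead of the hard-coded int/int2/int4/int8.
* CLArgMinMaxLayerKernel: add S64 to the output data type checks in
  validate_arguments().
* CLArgMinMaxLayer: initialise the info of the intermediate (not
  reshaped) output tensor with the requested output data type before
  configuring the kernel.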

Change-Id: I8b385ef8a00ec5de60299bc7a359766ba5417e68
Signed-off-by: Pablo Marquez Tello <pablo.tello@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9918
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 src/core/CL/cl_kernels/common/arg_min_max.cl   | 18 +++++++++---------
 src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp |  4 +++-
 src/runtime/CL/functions/CLArgMinMaxLayer.cpp  |  5 +++++
 3 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/src/core/CL/cl_kernels/common/arg_min_max.cl b/src/core/CL/cl_kernels/common/arg_min_max.cl
index 438f46eb24..413fcf5333 100644
--- a/src/core/CL/cl_kernels/common/arg_min_max.cl
+++ b/src/core/CL/cl_kernels/common/arg_min_max.cl
@@ -85,9 +85,9 @@ inline DATA_TYPE_OUTPUT vectorized_compute_arg_min_max_4(DATA_TYPE *min_max_val,
     VEC_DATA_TYPE(COND_DATA_TYPE, 2)
     idx_sel       = VECTOR_PREDICATE_EQ(in.s01, in.s23);
     in.s01      = select(in.s23, in.s01, idx_sel);
-    res.s01     = select(res.s23, res.s01, CONVERT(idx_sel, int2));
+    res.s01     = select(res.s23, res.s01, CONVERT(idx_sel, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 2) ));
     idx_sel.s0    = VECTOR_PREDICATE(in.s0, in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), COND_DATA_TYPE));
-    res.s0        = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
+    res.s0        = select(res.s1, res.s0, CONVERT(idx_sel.s0, DATA_TYPE_OUTPUT));
     *min_max_val  = SCALAR_SELECT_OP(in.s0, in.s1);
     *min_max_idx  = res.s0;
 }
@@ -97,12 +97,12 @@ inline DATA_TYPE_OUTPUT vectorized_compute_arg_min_max_8(DATA_TYPE *min_max_val,
     VEC_DATA_TYPE(COND_DATA_TYPE, 4)
     idx_sel       = VECTOR_PREDICATE_EQ(in.s0123, in.s4567);
     in.s0123      = select(in.s4567, in.s0123, idx_sel);
-    res.s0123     = select(res.s4567, res.s0123, CONVERT(idx_sel, int4));
+    res.s0123     = select(res.s4567, res.s0123, CONVERT(idx_sel,  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 4) ));
     idx_sel.s01   = (VECTOR_PREDICATE(in.s01, in.s23)) || (in.s01 == in.s23 && CONVERT(((res.s01 < res.s23)), VEC_DATA_TYPE(COND_DATA_TYPE, 2)));
     in.s01        = select(in.s23, in.s01, idx_sel.s01);
-    res.s01       = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
+    res.s01       = select(res.s23, res.s01, CONVERT(idx_sel.s01,  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 2) ));
     idx_sel.s0    = VECTOR_PREDICATE(in.s0, in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), COND_DATA_TYPE));
-    res.s0        = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
+    res.s0        = select(res.s1, res.s0, CONVERT(idx_sel.s0, DATA_TYPE_OUTPUT));
     *min_max_val  = SCALAR_SELECT_OP(in.s0, in.s1);
     *min_max_idx  = res.s0;
 }
@@ -112,15 +112,15 @@ inline DATA_TYPE_OUTPUT vectorized_compute_arg_min_max_16(DATA_TYPE *min_max_val
     VEC_DATA_TYPE(COND_DATA_TYPE, 8)
     idx_sel       = VECTOR_PREDICATE_EQ(in.s01234567, in.s89abcdef);
     in.s01234567  = select(in.s89abcdef, in.s01234567, idx_sel);
-    res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));
+    res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel,  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 8) ));
     idx_sel.s0123 = VECTOR_PREDICATE(in.s0123, in.s4567) || (in.s0123 == in.s4567 && CONVERT(((res.s0123 < res.s4567)), VEC_DATA_TYPE(COND_DATA_TYPE, 4)));
     in.s0123      = select(in.s4567, in.s0123, idx_sel.s0123);
-    res.s0123     = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));
+    res.s0123     = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123,  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 4) ));
     idx_sel.s01   = (VECTOR_PREDICATE(in.s01, in.s23)) || (in.s01 == in.s23 && CONVERT(((res.s01 < res.s23)), VEC_DATA_TYPE(COND_DATA_TYPE, 2)));
     in.s01        = select(in.s23, in.s01, idx_sel.s01);
-    res.s01       = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
+    res.s01       = select(res.s23, res.s01, CONVERT(idx_sel.s01,  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 2) ));
     idx_sel.s0    = VECTOR_PREDICATE(in.s0, in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), COND_DATA_TYPE));
-    res.s0        = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
+    res.s0        = select(res.s1, res.s0, CONVERT(idx_sel.s0, DATA_TYPE_OUTPUT));
     *min_max_val  = SCALAR_SELECT_OP(in.s0, in.s1);
     *min_max_idx  = res.s0;
 }
diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
index 41f885e4ba..2728958add 100644
--- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
@@ -45,13 +45,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::S64);
+
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Only ARG_IDX_MAX and ARG_IDX_MIN are supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
 
     if(output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32, DataType::S64, DataType::U64);
     }
 
     return Status{};
diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
index ea6311afdb..b30d739025 100644
--- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
+++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
@@ -96,10 +96,15 @@ void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const
     DataType          output_data_type = (output->info()->data_type() == DataType::UNKNOWN) ? DataType::S32 : output->info()->data_type();
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
 
+    TensorShape not_reshaped_output_shape{ input->info()->tensor_shape() };
+    not_reshaped_output_shape.set(axis, 1);
+    auto_init_if_empty(*_not_reshaped_output.info(), input->info()->clone()->set_tensor_shape(not_reshaped_output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
+
     _arg_min_max_kernel = std::make_unique<CLArgMinMaxLayerKernel>();
     _arg_min_max_kernel->configure(compile_context, input, &_not_reshaped_output, axis, op);
 
     _memory_group.manage(&_not_reshaped_output);
+
     _reshape.configure(compile_context, &_not_reshaped_output, output);
     _not_reshaped_output.allocator()->allocate();
 }
-- 
cgit v1.2.1