From b9626ab169a168a7c1ca57edd1996e1e80938bf1 Mon Sep 17 00:00:00 2001
From: Michalis Spyrou
Date: Mon, 13 May 2019 17:41:01 +0100
Subject: COMPMID-2243 ArgMinMaxLayer: support new datatypes

Change-Id: I846e833e0c94090cbbdcd6aee6061cea8295f4f9
Signed-off-by: Michalis Spyrou
Reviewed-on: https://review.mlplatform.org/c/1131
Reviewed-by: Giuseppe Rossini
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
---
 .../core/CL/kernels/CLReductionOperationKernel.h   |  6 +-
 arm_compute/core/NEON/wrapper/intrinsics/inv.h     | 10 +++-
 .../runtime/NEON/functions/NEArgMinMaxLayer.h      |  4 +-
 src/core/CL/cl_kernels/reduction_operation.cl      | 67 +++++++++++++---------
 src/core/CL/kernels/CLReductionOperationKernel.cpp |  6 +-
 .../NEON/kernels/NEReductionOperationKernel.cpp    | 25 +++++---
 tests/validation/CL/ArgMinMax.cpp                  | 13 ++++-
 tests/validation/NEON/ArgMinMax.cpp                | 21 +++++++
 tests/validation/fixtures/ArgMinMaxFixture.h       | 34 +++++++----
 tests/validation/reference/ReductionOperation.cpp  |  1 +
 10 files changed, 133 insertions(+), 54 deletions(-)

diff --git a/arm_compute/core/CL/kernels/CLReductionOperationKernel.h b/arm_compute/core/CL/kernels/CLReductionOperationKernel.h
index b5b90a15ce..aba11e1ad1 100644
--- a/arm_compute/core/CL/kernels/CLReductionOperationKernel.h
+++ b/arm_compute/core/CL/kernels/CLReductionOperationKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,7 +50,7 @@ public:
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/F16/F32.
+     * @param[in]  input  Source tensor. Data types supported: QASYMM8/S32/F16/F32.
      * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
      *                    Output will have the same number of dimensions as input.
      * @param[in]  axis   Axis along which to reduce. Supported reduction axis : 0,1,2,3
@@ -61,7 +61,7 @@ public:
     /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperationKernel.
      *
-     * @param[in] input  Source tensor info. Data types supported: QASYMM8/F16/F32.
+     * @param[in] input  Source tensor info. Data types supported: QASYMM8/S32/F16/F32.
      * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
      *                   Output will have the same number of dimensions as input.
      * @param[in] axis   Axis along which to reduce. Supported reduction axis : 0,1,2,3
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/inv.h b/arm_compute/core/NEON/wrapper/intrinsics/inv.h
index a86a9d4671..acb2c91feb 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/inv.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/inv.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,12 +37,20 @@ namespace wrapper
         return prefix##_##postfix(a); \
     }
 
+#define VINV_IMPL_INT(vtype, prefix, postfix) \
+    inline vtype vinv(const vtype &a)         \
+    {                                         \
+        ARM_COMPUTE_ERROR("Not supported");   \
+    }
+
 VINV_IMPL(float32x2_t, vinv, f32)
+VINV_IMPL_INT(int32x2_t, vinv, s32)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 VINV_IMPL(float16x4_t, vinv, f16)
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 VINV_IMPL(float32x4_t, vinvq, f32)
+VINV_IMPL_INT(int32x4_t, vinvq, s32)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 VINV_IMPL(float16x8_t, vinvq, f16)
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
index 87d77a5e13..55b39e45ec 100644
--- a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
@@ -48,7 +48,7 @@ public:
     NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Input source tensor. Data types supported: F16/F32.
+     * @param[in]  input  Input source tensor. Data types supported: QASYMM8/S32/F16/F32.
      * @param[in]  axis   Axis to find max/min index.
      * @param[out] output Output source tensor. Data types supported: U32.
      * @param[in]  op     Operation to perform: min or max
@@ -56,7 +56,7 @@ public:
     void configure(ITensor *input, int axis, ITensor *output, const ReductionOperation &op);
     /** Static function to check if given info will lead to a valid configuration of @ref NEArgMinMaxLayer
      *
-     * @param[in] input  Input source tensor info. Data types supported: F16/F32.
+     * @param[in] input  Input source tensor info. Data types supported: QASYMM8/S32/F16/F32.
      * @param[in] axis   Axis to find max/min index.
     * @param[in] output Output source tensor info. Data types supported: U32.
     * @param[in] op     Operation to perform: min or max
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl
index 2651123cf5..749e3cdaa3 100644
--- a/src/core/CL/cl_kernels/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/reduction_operation.cl
@@ -23,6 +23,19 @@
  */
 #include "helpers.h"
 
+#if FLOAT_DATA_TYPE
+#define ISGREATER(x, y) isgreater(x, y)
+#define ISLESS(x, y) isless(x, y)
+#else // !FLOAT_DATA_TYPE
+#if defined(WIDTH)
+#define ISGREATER(x, y) (x > y) ? 1 : 0
+#define ISLESS(x, y) (x < y) ? 1 : 0
+#else // !defined(WIDTH)
+#define ISGREATER(x, y) select((int16)0, (int16)-1, x > y)
+#define ISLESS(x, y) select((int16)0, (int16)-1, x < y)
+#endif // defined(WIDTH)
+#endif // FLOAT_DATA_TYPE
+
 /** Calculate square sum of a vector
  *
  * @param[in] input Pointer to the first pixel.
@@ -124,9 +137,9 @@ __kernel void reduction_operation_x(
         {
 #if defined(PROD)
             local_results[lid] *= local_results[lid + i];
-#else //!defined(PROD)
+#else // !defined(PROD)
            local_results[lid] += local_results[lid + i];
-#endif //defined(PROD)
+#endif // defined(PROD)
         }
         barrier(CLK_LOCAL_MEM_FENCE);
     }
@@ -138,7 +151,7 @@ __kernel void reduction_operation_x(
         {
             local_results[0] /= WIDTH;
         }
-#endif /* defined(MEAN) && defined(WIDTH) */
+#endif // defined(MEAN) && defined(WIDTH)
         ((__global DATA_TYPE *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0];
     }
 }
@@ -153,7 +166,7 @@ __kernel void reduction_operation_x(
  * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
  * @note In case of ARG_MIN and ARG_MAX the condition data type must be passed at compile time using -DCOND_DATA_TYPE e.g. -DCOND_DATA_TYPE=short
  *
- * @param[in] src_ptr                              Pointer to the source tensor. Supported data types: F16/F32 and QASYMM8 for operation MEAN
+ * @param[in] src_ptr                              Pointer to the source tensor. Supported data types: S32/F16/F32 and QASYMM8 for operation MEAN
  * @param[in] src_stride_x                         Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_offset_first_element_in_bytes    The offset of the first element in the source tensor
@@ -179,11 +192,11 @@ __kernel void reduction_operation_non_parallel_x(
     {
         DATA_TYPE_PROMOTED in = *((__global DATA_TYPE *)vector_offset(&src, x));
 #if defined(ARG_MAX)
-        indx = select(indx, x, isgreater(in, res));
-        res  = select(res, in, CONVERT(isgreater(in, res), COND_DATA_TYPE));
+        indx = select(indx, x, ISGREATER(in, res));
+        res  = select(res, in, CONVERT(ISGREATER(in, res), COND_DATA_TYPE));
 #elif defined(ARG_MIN)
-        indx = select(indx, x, isless(in, res));
-        res  = select(res, in, CONVERT(isless(in, res), COND_DATA_TYPE));
+        indx = select(indx, x, ISLESS(in, res));
+        res  = select(res, in, CONVERT(ISLESS(in, res), COND_DATA_TYPE));
 #else // !(defined(ARG_MAX) || defined(ARG_MIN))
         res += in;
 #endif // defined(ARG_MAX) || defined(ARG_MIN)
@@ -199,7 +212,7 @@ __kernel void reduction_operation_non_parallel_x(
     *((__global uchar *)output.ptr) = convert_uchar(res);
 #endif // defined(ARG_MAX) || defined(ARG_MIN)
 }
-#endif /* defined(WIDTH) */
+#endif // defined(WIDTH)
 
 #if defined(HEIGHT)
 /** This kernel performs reduction on y-axis.
@@ -207,7 +220,7 @@ __kernel void reduction_operation_non_parallel_x(
  *
  * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
  * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
  *
- * @param[in] src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in] src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/S32/F16/F32
  * @param[in] src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -243,22 +256,22 @@ __kernel void reduction_operation_y(
         VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
         in = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
 
 #if defined(ARG_MAX)
-        uint16 cond_conv = CONVERT(isgreater(in, res), uint16);
+        uint16 cond_conv = CONVERT(ISGREATER(in, res), uint16);
         indx             = select(indx, y, cond_conv);
-        res              = select(res, in, isgreater(in, res));
+        res              = select(res, in, ISGREATER(in, res));
 #elif defined(ARG_MIN)
-        uint16 cond_conv = CONVERT(isless(in, res), uint16);
+        uint16 cond_conv = CONVERT(ISLESS(in, res), uint16);
         indx             = select(indx, y, cond_conv);
-        res              = select(res, in, isless(in, res));
+        res              = select(res, in, ISLESS(in, res));
 #else // !(defined(ARG_MAX) || defined(ARG_MIN))
 #if defined(SUM_SQUARE)
         in *= in;
 #endif // defined(SUM_SQUARE)
 #if defined(PROD)
         res *= in;
-#else //!defined(PROD)
+#else // !defined(PROD)
         res += in;
-#endif //defined(PROD)
+#endif // defined(PROD)
 #endif // defined(ARG_MAX) || defined(ARG_MIN)
     }
 
@@ -272,7 +285,7 @@ __kernel void reduction_operation_y(
     vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
 #endif // defined(ARG_MAX) || defined(ARG_MIN)
 }
-#endif /* defined(HEIGHT) */
+#endif // defined(HEIGHT)
 
 #if defined(DEPTH)
 /** This kernel performs reduction on z-axis.
@@ -280,7 +293,7 @@ __kernel void reduction_operation_y(
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
  * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
  *
- * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/S32/F16/F32
  * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
@@ -330,13 +343,13 @@ __kernel void reduction_operation_z(
 #endif // defined(COMPLEX)
 
 #if defined(ARG_MAX)
-        uint16 cond_conv = CONVERT(isgreater(in, res), uint16);
+        uint16 cond_conv = CONVERT(ISGREATER(in, res), uint16);
         indx             = select(indx, z, cond_conv);
-        res              = select(res, in, isgreater(in, res));
+        res              = select(res, in, ISGREATER(in, res));
 #elif defined(ARG_MIN)
-        uint16 cond_conv = CONVERT(isless(in, res), uint16);
+        uint16 cond_conv = CONVERT(ISLESS(in, res), uint16);
         indx             = select(indx, z, cond_conv);
-        res              = select(res, in, isless(in, res));
+        res              = select(res, in, ISLESS(in, res));
 #else // !(defined(ARG_MAX) || defined(ARG_MIN))
 #if defined(SUM_SQUARE)
         in *= in;
@@ -374,7 +387,7 @@ __kernel void reduction_operation_z(
  * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
  * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
  *
- * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/S32/F16/F32
  * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
@@ -419,13 +432,13 @@ __kernel void reduction_operation_w(
         in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
 
 #if defined(ARG_MAX)
-        uint16 cond_conv = CONVERT(isgreater(in, res), uint16);
+        uint16 cond_conv = CONVERT(ISGREATER(in, res), uint16);
         indx             = select(indx, w, cond_conv);
-        res              = select(res, in, isgreater(in, res));
+        res              = select(res, in, ISGREATER(in, res));
 #elif defined(ARG_MIN)
-        uint16 cond_conv = CONVERT(isless(in, res), uint16);
+        uint16 cond_conv = CONVERT(ISLESS(in, res), uint16);
         indx             = select(indx, w, cond_conv);
-        res              = select(res, in, isless(in, res));
+        res              = select(res, in, ISLESS(in, res));
 #else // !(defined(ARG_MAX) || defined(ARG_MIN))
 #if defined(SUM_SQUARE)
         in *= in;
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index db4850f14e..cb57070612 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -49,7 +49,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     if(input->num_channels() == 1)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
     }
     else
     {
@@ -160,8 +160,10 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
     {
         data_type_promoted = "uint";
     }
+
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option("-DDATA_TYPE_PROMOTED=" + data_type_promoted);
+    build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE");
     build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE");
     build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DMEAN");
     build_opts.add_option_if(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX");
@@ -199,7 +201,7 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
     if(is_serial_op)
     {
         build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
-        build_opts.add_option_if_else(_input->info()->data_type() == DataType::F32, "-DCOND_DATA_TYPE=int", "-DCOND_DATA_TYPE=short");
+        build_opts.add_option_if_else(_input->info()->data_type() == DataType::F16, "-DCOND_DATA_TYPE=short", "-DCOND_DATA_TYPE=int");
         kernel_axis_name = "non_parallel_x";
     }
     else
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index aa20d1f40d..5f0a4dd371 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -41,7 +41,8 @@ namespace arm_compute
 {
 namespace
 {
-uint32x4x4_t calculate_index(uint32_t idx, float32x4_t a, float32x4_t b, uint32x4x4_t c, ReductionOperation op, int axis)
+template <typename T>
+uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis)
 {
     uint32x4_t mask{ 0 };
     if(op == ReductionOperation::ARG_IDX_MIN)
@@ -107,8 +108,8 @@ uint32x4x4_t calculate_index(uint32_t idx, uint8x16_t a, uint8x16_t b, uint32x4x
     return res;
 }
-
-uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float32x4_t vec_res_value, ReductionOperation op)
+template <typename T>
+uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op)
 {
     uint32x4_t res_idx_mask{ 0 };
     uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
@@ -124,7 +125,7 @@ uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float32x4_t vec_res_va
     {
         auto pmax    = wrapper::vpmax(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
         pmax         = wrapper::vpmax(pmax, pmax);
-        auto mask    = vceqq_f32(vec_res_value, wrapper::vcombine(pmax, pmax));
+        auto mask    = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax));
         res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask);
     }
@@ -394,14 +395,14 @@ struct RedOpX
             case ReductionOperation::ARG_IDX_MIN:
             {
                 auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
-                vec_res_idx   = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+                vec_res_idx   = calculate_index<decltype(vec_res_value)>(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
                 vec_res_value = temp_vec_res_value;
                 break;
             }
             case ReductionOperation::ARG_IDX_MAX:
             {
                 auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
-                vec_res_idx   = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+                vec_res_idx   = calculate_index<decltype(vec_res_value)>(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
                 vec_res_value = temp_vec_res_value;
                 break;
             }
@@ -446,7 +447,7 @@ struct RedOpX
             case ReductionOperation::ARG_IDX_MIN:
             case ReductionOperation::ARG_IDX_MAX:
             {
-                auto res = calculate_vector_index(vec_res_idx, vec_res_value, op);
+                auto res = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
                 *(reinterpret_cast<uint32_t *>(output.ptr())) = res;
                 break;
             }
@@ -943,6 +944,8 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F32:
                     return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op);
+                case DataType::S32:
+                    return Reducer<RedOpX<int32_t, 4>>::reduceX(window, input, output, RedOpX<int32_t, 4>(), op);
                 default:
                     ARM_COMPUTE_ERROR("Not supported");
             }
@@ -957,6 +960,8 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F32:
                     return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(), op);
+                case DataType::S32:
+                    return Reducer<RedOpYZW<int32_t, 4>>::reduceY(window, input, output, RedOpYZW<int32_t, 4>(), op);
                 default:
                     ARM_COMPUTE_ERROR("Not supported");
             }
@@ -971,6 +976,8 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F32:
                     return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(), op);
+                case DataType::S32:
+                    return Reducer<RedOpYZW<int32_t, 4>>::reduceZ(window, input, output, RedOpYZW<int32_t, 4>(), op);
                 default:
                     ARM_COMPUTE_ERROR("Not supported");
             }
@@ -985,6 +992,8 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F32:
                     return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(), op);
+                case DataType::S32:
+                    return Reducer<RedOpYZW<int32_t, 4>>::reduceW(window, input, output, RedOpYZW<int32_t, 4>(), op);
                 default:
                     ARM_COMPUTE_ERROR("Not supported");
             }
@@ -1002,7 +1011,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
 
     if(input->num_channels() == 1)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
     }
     else
     {
diff --git a/tests/validation/CL/ArgMinMax.cpp b/tests/validation/CL/ArgMinMax.cpp
index 0b873945d3..6de09bed25 100644
--- a/tests/validation/CL/ArgMinMax.cpp
+++ b/tests/validation/CL/ArgMinMax.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -90,6 +90,17 @@ DATA_TEST_CASE(Configuration,
 template <typename T>
 using CLArgMinMaxValidationFixture = ArgMinMaxValidationFixture<CLTensor, CLAccessor, CLArgMinMaxLayer, T>;
 
+TEST_SUITE(S32)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       CLArgMinMaxValidationFixture<int32_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::S32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // S32
+
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall,
diff --git a/tests/validation/NEON/ArgMinMax.cpp b/tests/validation/NEON/ArgMinMax.cpp
index d3f70e6424..71fb39a30d 100644
--- a/tests/validation/NEON/ArgMinMax.cpp
+++ b/tests/validation/NEON/ArgMinMax.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
 #include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
@@ -90,6 +91,26 @@ DATA_TEST_CASE(Configuration,
 template <typename T>
 using NEArgMinMaxValidationFixture = ArgMinMaxValidationFixture<Tensor, Accessor, NEArgMinMaxLayer, T>;
 
+TEST_SUITE(S32)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       NEArgMinMaxValidationFixture<int32_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::S32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       NEArgMinMaxValidationFixture<int32_t>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::S32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // S32
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
diff --git a/tests/validation/fixtures/ArgMinMaxFixture.h b/tests/validation/fixtures/ArgMinMaxFixture.h
index e263b25bf2..ed6b51abe5 100644
--- a/tests/validation/fixtures/ArgMinMaxFixture.h
+++ b/tests/validation/fixtures/ArgMinMaxFixture.h
@@ -56,17 +56,31 @@ protected:
     template <typename U>
     void fill(U &&tensor)
     {
-        if(!is_data_type_quantized(tensor.data_type()))
+        switch(tensor.data_type())
         {
-            std::uniform_real_distribution<> distribution(-1.0f, 1.0f);
-            library->fill(tensor, distribution, 0);
-        }
-        else
-        {
-            std::pair<int, int> bounds = get_quantized_bounds(tensor.quantization_info(), -1.0f, 1.0f);
-            std::uniform_int_distribution<uint8_t> distribution(bounds.first, bounds.second);
-
-            library->fill(tensor, distribution, 0);
+            case DataType::F32:
+            case DataType::F16:
+            {
+                std::uniform_real_distribution<> distribution(-1.0f, 1.0f);
+                library->fill(tensor, distribution, 0);
+                break;
+            }
+            case DataType::S32:
+            {
+                std::uniform_int_distribution<int32_t> distribution(-100, 100);
+                library->fill(tensor, distribution, 0);
+                break;
+            }
+            case DataType::QASYMM8:
+            {
+                std::pair<int, int> bounds = get_quantized_bounds(tensor.quantization_info(), -1.0f, 1.0f);
+                std::uniform_int_distribution<uint8_t> distribution(bounds.first, bounds.second);
+
+                library->fill(tensor, distribution, 0);
+                break;
+            }
+            default:
+                ARM_COMPUTE_ERROR("DataType for ArgMinMax not implemented");
         }
     }
diff --git a/tests/validation/reference/ReductionOperation.cpp b/tests/validation/reference/ReductionOperation.cpp
index fb7a6d6997..c7624a4628 100644
--- a/tests/validation/reference/ReductionOperation.cpp
+++ b/tests/validation/reference/ReductionOperation.cpp
@@ -238,6 +238,7 @@ template SimpleTensor<uint32_t> reduction_operation(const SimpleTensor<float> &src,
 template SimpleTensor<uint32_t> reduction_operation(const SimpleTensor<half> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
 template SimpleTensor<uint32_t> reduction_operation(const SimpleTensor<uint8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
+template SimpleTensor<uint32_t> reduction_operation(const SimpleTensor<int32_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
 template SimpleTensor<float> reduction_operation(const SimpleTensor<float> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
 template SimpleTensor<half> reduction_operation(const SimpleTensor<half> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
--
cgit v1.2.1
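
Usage note (illustrative, not part of the commit): with this patch applied, NEArgMinMaxLayer accepts S32 input directly. The sketch below is a minimal example under stated assumptions: the tensor names, shape, and main() harness are invented for illustration, and it assumes configure() auto-initializes an empty output tensor with U32 index metadata, as ACL NEON functions generally do.

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // Hypothetical 2D S32 input: width 8, height 4.
        Tensor input{};
        Tensor output{};
        input.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::S32));

        // Index of the maximum element along axis 0; the output holds U32 indices.
        NEArgMinMaxLayer argmax{};
        argmax.configure(&input, 0 /* axis */, &output, ReductionOperation::ARG_IDX_MAX);

        input.allocator()->allocate();
        output.allocator()->allocate();
        // ... fill input via input.buffer(), then:
        argmax.run();
        return 0;
    }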