From aea14c63e2efeda9d5f7492099389d439c65204f Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Thu, 3 Jan 2019 11:10:25 +0000 Subject: COMPMID-1764 NEON: Implement ArgMax/ArgMin Change-Id: Ibe23aa90b36ffd8553d1d1c35fada5d300fab829 Reviewed-on: https://review.mlplatform.org/475 Reviewed-by: Isabella Gottardi Tested-by: Arm Jenkins Reviewed-by: Giuseppe Rossini --- .../core/NEON/wrapper/intrinsics/bitselect.h | 64 -- arm_compute/core/NEON/wrapper/intrinsics/bsl.h | 64 ++ arm_compute/core/NEON/wrapper/intrinsics/ceq.h | 64 ++ arm_compute/core/NEON/wrapper/intrinsics/cgt.h | 64 ++ arm_compute/core/NEON/wrapper/intrinsics/clt.h | 64 ++ .../core/NEON/wrapper/intrinsics/greaterthan.h | 64 -- .../core/NEON/wrapper/intrinsics/intrinsics.h | 9 +- arm_compute/core/NEON/wrapper/intrinsics/orr.h | 60 ++ arm_compute/core/NEON/wrapper/intrinsics/pmax.h | 53 ++ arm_compute/core/NEON/wrapper/intrinsics/pmin.h | 53 ++ arm_compute/core/utils/misc/ShapeCalculator.h | 15 + arm_compute/runtime/NEON/NEFunctions.h | 1 + .../runtime/NEON/functions/NEArgMinMaxLayer.h | 78 +++ .../NEON/kernels/NEReductionOperationKernel.cpp | 765 ++++++++++++++------- src/core/NEON/kernels/NESelectKernel.cpp | 14 +- src/runtime/NEON/functions/NEArgMinMaxLayer.cpp | 71 ++ tests/validation/NEON/ArgMinMax.cpp | 167 +++++ tests/validation/fixtures/ArgMinMaxFixture.h | 60 +- tests/validation/fixtures/ReduceMeanFixture.h | 4 +- .../fixtures/ReductionOperationFixture.h | 4 +- tests/validation/reference/L2NormalizeLayer.cpp | 4 +- tests/validation/reference/ReductionOperation.cpp | 59 +- tests/validation/reference/ReductionOperation.h | 8 +- 23 files changed, 1338 insertions(+), 471 deletions(-) delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/bitselect.h create mode 100644 arm_compute/core/NEON/wrapper/intrinsics/bsl.h create mode 100644 arm_compute/core/NEON/wrapper/intrinsics/ceq.h create mode 100644 arm_compute/core/NEON/wrapper/intrinsics/cgt.h create mode 100644 arm_compute/core/NEON/wrapper/intrinsics/clt.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/greaterthan.h create mode 100644 arm_compute/core/NEON/wrapper/intrinsics/orr.h create mode 100644 arm_compute/core/NEON/wrapper/intrinsics/pmax.h create mode 100644 arm_compute/core/NEON/wrapper/intrinsics/pmin.h create mode 100644 arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h create mode 100644 src/runtime/NEON/functions/NEArgMinMaxLayer.cpp create mode 100644 tests/validation/NEON/ArgMinMax.cpp diff --git a/arm_compute/core/NEON/wrapper/intrinsics/bitselect.h b/arm_compute/core/NEON/wrapper/intrinsics/bitselect.h deleted file mode 100644 index 8223f6d463..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/bitselect.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT SELECT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_WRAPPER_BITSELECT_H__ -#define __ARM_COMPUTE_WRAPPER_BITSELECT_H__ - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VBITSELECT_IMPL(stype, vtype, ctype, prefix, postfix) \ - inline vtype vbitselect(const ctype &a, const vtype &b, const vtype &c) \ - { \ - return prefix##_##postfix(a, b, c); \ - } - -VBITSELECT_IMPL(uint8_t, uint8x8_t, uint8x8_t, vbsl, u8) -VBITSELECT_IMPL(int8_t, int8x8_t, uint8x8_t, vbsl, s8) -VBITSELECT_IMPL(uint16_t, uint16x4_t, uint16x4_t, vbsl, u16) -VBITSELECT_IMPL(int16_t, int16x4_t, uint16x4_t, vbsl, s16) -VBITSELECT_IMPL(uint32_t, uint32x2_t, uint32x2_t, vbsl, u32) -VBITSELECT_IMPL(int32_t, int32x2_t, uint32x2_t, vbsl, s32) -VBITSELECT_IMPL(float32x2_t, float32x2_t, uint32x2_t, vbsl, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VBITSELECT_IMPL(float16x4_t, float16x4_t, uint16x4_t, vbsl, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VBITSELECT_IMPL(uint8_t, uint8x16_t, uint8x16_t, vbslq, u8) -VBITSELECT_IMPL(int8_t, int8x16_t, uint8x16_t, vbslq, s8) -VBITSELECT_IMPL(uint16_t, uint16x8_t, uint16x8_t, vbslq, u16) -VBITSELECT_IMPL(int16_t, int16x8_t, uint16x8_t, vbslq, s16) -VBITSELECT_IMPL(uint32_t, uint32x4_t, uint32x4_t, vbslq, u32) -VBITSELECT_IMPL(int32_t, int32x4_t, uint32x4_t, vbslq, s32) -VBITSELECT_IMPL(float32x4_t, float32x4_t, uint32x4_t, vbslq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VBITSELECT_IMPL(float16x8_t, float16x8_t, uint16x8_t, vbslq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VBITSELECT_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* __ARM_COMPUTE_WRAPPER_BITSELECT_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/bsl.h b/arm_compute/core/NEON/wrapper/intrinsics/bsl.h new file mode 100644 index 0000000000..9831b4b842 --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/bsl.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_WRAPPER_BSL_H__ +#define __ARM_COMPUTE_WRAPPER_BSL_H__ + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VBSL_IMPL(vctype, vtype, prefix, postfix) \ + inline vtype vbsl(const vctype &a, const vtype &b, const vtype &c) \ + { \ + return prefix##_##postfix(a, b, c); \ + } + +VBSL_IMPL(uint8x8_t, uint8x8_t, vbsl, u8) +VBSL_IMPL(uint8x8_t, int8x8_t, vbsl, s8) +VBSL_IMPL(uint16x4_t, uint16x4_t, vbsl, u16) +VBSL_IMPL(uint16x4_t, int16x4_t, vbsl, s16) +VBSL_IMPL(uint32x2_t, uint32x2_t, vbsl, u32) +VBSL_IMPL(uint32x2_t, int32x2_t, vbsl, s32) +VBSL_IMPL(uint32x2_t, float32x2_t, vbsl, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VBSL_IMPL(uint16x4_t, float16x4_t, vbsl, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +VBSL_IMPL(uint8x16_t, uint8x16_t, vbslq, u8) +VBSL_IMPL(uint8x16_t, int8x16_t, vbslq, s8) +VBSL_IMPL(uint16x8_t, uint16x8_t, vbslq, u16) +VBSL_IMPL(uint16x8_t, int16x8_t, vbslq, s16) +VBSL_IMPL(uint32x4_t, uint32x4_t, vbslq, u32) +VBSL_IMPL(uint32x4_t, int32x4_t, vbslq, s32) +VBSL_IMPL(uint32x4_t, float32x4_t, vbslq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VBSL_IMPL(uint16x8_t, float16x8_t, vbslq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VBSL_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_BSL_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/ceq.h b/arm_compute/core/NEON/wrapper/intrinsics/ceq.h new file mode 100644 index 0000000000..812ac326a8 --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/ceq.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_WRAPPER_CEQ_H__ +#define __ARM_COMPUTE_WRAPPER_CEQ_H__ + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VCEQ_IMPL(votype, vtype, prefix, postfix) \ + inline votype vceq(const vtype &a, const vtype &b) \ + { \ + return prefix##_##postfix(a, b); \ + } + +VCEQ_IMPL(uint8x8_t, uint8x8_t, vceq, u8) +VCEQ_IMPL(uint8x8_t, int8x8_t, vceq, s8) +VCEQ_IMPL(uint16x4_t, uint16x4_t, vceq, u16) +VCEQ_IMPL(uint16x4_t, int16x4_t, vceq, s16) +VCEQ_IMPL(uint32x2_t, uint32x2_t, vceq, u32) +VCEQ_IMPL(uint32x2_t, int32x2_t, vceq, s32) +VCEQ_IMPL(uint32x2_t, float32x2_t, vceq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VCEQ_IMPL(uint16x4_t, float16x4_t, vceq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +VCEQ_IMPL(uint8x16_t, uint8x16_t, vceqq, u8) +VCEQ_IMPL(uint8x16_t, int8x16_t, vceqq, s8) +VCEQ_IMPL(uint16x8_t, uint16x8_t, vceqq, u16) +VCEQ_IMPL(uint16x8_t, int16x8_t, vceqq, s16) +VCEQ_IMPL(uint32x4_t, uint32x4_t, vceqq, u32) +VCEQ_IMPL(uint32x4_t, int32x4_t, vceqq, s32) +VCEQ_IMPL(uint32x4_t, float32x4_t, vceqq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VCEQ_IMPL(uint16x8_t, float16x8_t, vceqq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VCEQ_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_CEQ_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/cgt.h b/arm_compute/core/NEON/wrapper/intrinsics/cgt.h new file mode 100644 index 0000000000..c2ed9df1dc --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/cgt.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_WRAPPER_CGT_H__ +#define __ARM_COMPUTE_WRAPPER_CGT_H__ + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VCGT_IMPL(votype, vtype, prefix, postfix) \ + inline votype vcgt(const vtype &a, const vtype &b) \ + { \ + return prefix##_##postfix(a, b); \ + } + +VCGT_IMPL(uint8x8_t, uint8x8_t, vcgt, u8) +VCGT_IMPL(uint8x8_t, int8x8_t, vcgt, s8) +VCGT_IMPL(uint16x4_t, uint16x4_t, vcgt, u16) +VCGT_IMPL(uint16x4_t, int16x4_t, vcgt, s16) +VCGT_IMPL(uint32x2_t, uint32x2_t, vcgt, u32) +VCGT_IMPL(uint32x2_t, int32x2_t, vcgt, s32) +VCGT_IMPL(uint32x2_t, float32x2_t, vcgt, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VCGT_IMPL(uint16x4_t, float16x4_t, vcgt, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +VCGT_IMPL(uint8x16_t, uint8x16_t, vcgtq, u8) +VCGT_IMPL(uint8x16_t, int8x16_t, vcgtq, s8) +VCGT_IMPL(uint16x8_t, uint16x8_t, vcgtq, u16) +VCGT_IMPL(uint16x8_t, int16x8_t, vcgtq, s16) +VCGT_IMPL(uint32x4_t, uint32x4_t, vcgtq, u32) +VCGT_IMPL(uint32x4_t, int32x4_t, vcgtq, s32) +VCGT_IMPL(uint32x4_t, float32x4_t, vcgtq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VCGT_IMPL(uint16x8_t, float16x8_t, vcgtq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VCGT_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_CGT_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/clt.h b/arm_compute/core/NEON/wrapper/intrinsics/clt.h new file mode 100644 index 0000000000..a187c216d7 --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/clt.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_WRAPPER_CLT_H__ +#define __ARM_COMPUTE_WRAPPER_CLT_H__ + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VCLT_IMPL(votype, vtype, prefix, postfix) \ + inline votype vclt(const vtype &a, const vtype &b) \ + { \ + return prefix##_##postfix(a, b); \ + } + +VCLT_IMPL(uint8x8_t, uint8x8_t, vclt, u8) +VCLT_IMPL(uint8x8_t, int8x8_t, vclt, s8) +VCLT_IMPL(uint16x4_t, uint16x4_t, vclt, u16) +VCLT_IMPL(uint16x4_t, int16x4_t, vclt, s16) +VCLT_IMPL(uint32x2_t, uint32x2_t, vclt, u32) +VCLT_IMPL(uint32x2_t, int32x2_t, vclt, s32) +VCLT_IMPL(uint32x2_t, float32x2_t, vclt, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VCLT_IMPL(uint16x4_t, float16x4_t, vclt, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +VCLT_IMPL(uint8x16_t, uint8x16_t, vcltq, u8) +VCLT_IMPL(uint8x16_t, int8x16_t, vcltq, s8) +VCLT_IMPL(uint16x8_t, uint16x8_t, vcltq, u16) +VCLT_IMPL(uint16x8_t, int16x8_t, vcltq, s16) +VCLT_IMPL(uint32x4_t, uint32x4_t, vcltq, u32) +VCLT_IMPL(uint32x4_t, int32x4_t, vcltq, s32) +VCLT_IMPL(uint32x4_t, float32x4_t, vcltq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VCLT_IMPL(uint16x8_t, float16x8_t, vcltq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VCLT_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_CLT_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/greaterthan.h b/arm_compute/core/NEON/wrapper/intrinsics/greaterthan.h deleted file mode 100644 index 5ee7516a4e..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/greaterthan.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef __ARM_COMPUTE_WRAPPER_CGT_H__ -#define __ARM_COMPUTE_WRAPPER_CGT_H__ - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VCGT_IMPL(stype, vtype, rtype, prefix, postfix) \ - inline rtype vgreaterthan(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VCGT_IMPL(uint8_t, uint8x8_t, uint8x8_t, vcgt, u8) -VCGT_IMPL(int8_t, int8x8_t, uint8x8_t, vcgt, s8) -VCGT_IMPL(uint16_t, uint16x4_t, uint16x4_t, vcgt, u16) -VCGT_IMPL(int16_t, int16x4_t, uint16x4_t, vcgt, s16) -VCGT_IMPL(uint32_t, uint32x2_t, uint32x2_t, vcgt, u32) -VCGT_IMPL(int32_t, int32x2_t, uint32x2_t, vcgt, s32) -VCGT_IMPL(float32x2_t, float32x2_t, uint32x2_t, vcgt, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VCGT_IMPL(float16x4_t, float16x4_t, uint16x4_t, vcgt, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VCGT_IMPL(uint8_t, uint8x16_t, uint8x16_t, vcgtq, u8) -VCGT_IMPL(int8_t, int8x16_t, uint8x16_t, vcgtq, s8) -VCGT_IMPL(uint16_t, uint16x8_t, uint16x8_t, vcgtq, u16) -VCGT_IMPL(int16_t, int16x8_t, uint16x8_t, vcgtq, s16) -VCGT_IMPL(uint32_t, uint32x4_t, uint32x4_t, vcgtq, u32) -VCGT_IMPL(int32_t, int32x4_t, uint32x4_t, vcgtq, s32) -VCGT_IMPL(float32x4_t, float32x4_t, uint32x4_t, vcgtq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VCGT_IMPL(float16x8_t, float16x8_t, uint16x8_t, vcgtq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VCGT_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* __ARM_COMPUTE_WRAPPER_CGT_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h index d00d3303f1..97af983e62 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h +++ b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h @@ -26,14 +26,16 @@ #include "arm_compute/core/NEON/wrapper/intrinsics/add.h" #include "arm_compute/core/NEON/wrapper/intrinsics/and.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/bitselect.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/bsl.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/ceq.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/cgt.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/clt.h" #include "arm_compute/core/NEON/wrapper/intrinsics/combine.h" #include "arm_compute/core/NEON/wrapper/intrinsics/dup_n.h" #include "arm_compute/core/NEON/wrapper/intrinsics/exp.h" #include "arm_compute/core/NEON/wrapper/intrinsics/gethigh.h" #include "arm_compute/core/NEON/wrapper/intrinsics/getlane.h" #include "arm_compute/core/NEON/wrapper/intrinsics/getlow.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/greaterthan.h" #include "arm_compute/core/NEON/wrapper/intrinsics/inv.h" #include "arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h" #include "arm_compute/core/NEON/wrapper/intrinsics/load.h" @@ -44,7 +46,10 @@ #include "arm_compute/core/NEON/wrapper/intrinsics/movn.h" #include "arm_compute/core/NEON/wrapper/intrinsics/mul.h" #include "arm_compute/core/NEON/wrapper/intrinsics/neg.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/orr.h" #include "arm_compute/core/NEON/wrapper/intrinsics/padd.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/pmax.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/pmin.h" #include "arm_compute/core/NEON/wrapper/intrinsics/pow.h" #include "arm_compute/core/NEON/wrapper/intrinsics/rev64.h" #include "arm_compute/core/NEON/wrapper/intrinsics/store.h" diff --git a/arm_compute/core/NEON/wrapper/intrinsics/orr.h 
b/arm_compute/core/NEON/wrapper/intrinsics/orr.h new file mode 100644 index 0000000000..d82dc56a6d --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/orr.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_WRAPPER_ORR_H__ +#define __ARM_COMPUTE_WRAPPER_ORR_H__ + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VORR_IMPL(stype, vtype, prefix, postfix) \ + inline vtype vorr(const vtype &a, const vtype &b) \ + { \ + return prefix##_##postfix(a, b); \ + } + +VORR_IMPL(uint8_t, uint8x8_t, vorr, u8) +VORR_IMPL(int8_t, int8x8_t, vorr, s8) +VORR_IMPL(uint16_t, uint16x4_t, vorr, u16) +VORR_IMPL(int16_t, int16x4_t, vorr, s16) +VORR_IMPL(uint32_t, uint32x2_t, vorr, u32) +VORR_IMPL(int32_t, int32x2_t, vorr, s32) +VORR_IMPL(uint64_t, uint64x1_t, vorr, u64) +VORR_IMPL(int64_t, int64x1_t, vorr, s64) + +VORR_IMPL(uint8_t, uint8x16_t, vorrq, u8) +VORR_IMPL(int8_t, int8x16_t, vorrq, s8) +VORR_IMPL(uint16_t, uint16x8_t, vorrq, u16) +VORR_IMPL(int16_t, int16x8_t, vorrq, s16) +VORR_IMPL(uint32_t, uint32x4_t, vorrq, u32) +VORR_IMPL(int32_t, int32x4_t, vorrq, s32) +VORR_IMPL(uint64_t, uint64x2_t, vorrq, u64) +VORR_IMPL(int64_t, int64x2_t, vorrq, s64) + +#undef VORR_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_ORR_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/pmax.h b/arm_compute/core/NEON/wrapper/intrinsics/pmax.h new file mode 100644 index 0000000000..7f701f89c4 --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/pmax.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_WRAPPER_PMAX_H__ +#define __ARM_COMPUTE_WRAPPER_PMAX_H__ + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VPMAX_IMPL(stype, vtype, prefix, postfix) \ + inline vtype vpmax(const vtype &a, const vtype &b) \ + { \ + return prefix##_##postfix(a, b); \ + } + +VPMAX_IMPL(uint8_t, uint8x8_t, vpmax, u8) +VPMAX_IMPL(int8_t, int8x8_t, vpmax, s8) +VPMAX_IMPL(uint16_t, uint16x4_t, vpmax, u16) +VPMAX_IMPL(int16_t, int16x4_t, vpmax, s16) +VPMAX_IMPL(uint32_t, uint32x2_t, vpmax, u32) +VPMAX_IMPL(int32_t, int32x2_t, vpmax, s32) +VPMAX_IMPL(float, float32x2_t, vpmax, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VPMAX_IMPL(float16_t, float16x4_t, vpmax, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VPMAX_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_PMAX_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/pmin.h b/arm_compute/core/NEON/wrapper/intrinsics/pmin.h new file mode 100644 index 0000000000..52d5eb17a0 --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/pmin.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_WRAPPER_PMIN_H__ +#define __ARM_COMPUTE_WRAPPER_PMIN_H__ + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VPMIN_IMPL(stype, vtype, prefix, postfix) \ + inline vtype vpmin(const vtype &a, const vtype &b) \ + { \ + return prefix##_##postfix(a, b); \ + } + +VPMIN_IMPL(uint8_t, uint8x8_t, vpmin, u8) +VPMIN_IMPL(int8_t, int8x8_t, vpmin, s8) +VPMIN_IMPL(uint16_t, uint16x4_t, vpmin, u16) +VPMIN_IMPL(int16_t, int16x4_t, vpmin, s16) +VPMIN_IMPL(uint32_t, uint32x2_t, vpmin, u32) +VPMIN_IMPL(int32_t, int32x2_t, vpmin, s32) +VPMIN_IMPL(float, float32x2_t, vpmin, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VPMIN_IMPL(float16_t, float16x4_t, vpmin, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VPMIN_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_PMIN_H__ */ diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h index 70727424b3..619234d306 100644 --- a/arm_compute/core/utils/misc/ShapeCalculator.h +++ b/arm_compute/core/utils/misc/ShapeCalculator.h @@ -1033,6 +1033,21 @@ inline TensorShape compute_tiled_shape(const TensorShape &input_shape, const Mul return tiled_shape; } +/** Calculate the reduced shape of a tensor given an axis + * + * @param[in] input Input tensor info + * @param[in] axis Axis on which to perform reduction + * + * @return the calculated shape + */ +inline TensorShape compute_reduced_shape(const TensorShape &input, unsigned int axis) +{ + TensorShape output_shape{ input }; + output_shape.set(axis, 1); + + return output_shape; +} + /** Calculate the upsampled shape of a tensor * * @param[in] input Input tensor info diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h index 2e94030e53..2daef70cef 100644 --- a/arm_compute/runtime/NEON/NEFunctions.h +++ b/arm_compute/runtime/NEON/NEFunctions.h @@ -28,6 +28,7 @@ #include "arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h" #include "arm_compute/runtime/NEON/functions/NEAccumulate.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h" diff --git a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h new file mode 100644 index 0000000000..87d77a5e13 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEARGMINMAXLAYER_H__ +#define __ARM_COMPUTE_NEARGMINMAXLAYER_H__ + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/NEON/kernels/NEReductionOperationKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +namespace arm_compute +{ +class IsTensor; + +/** Function to calculate the index of the minimum or maximum values in a tensor based on an axis. + * This function calls the following NEON kernels: + * + * -# @ref NEReductionOperationKernel + * -# @ref NEFillBorderKernel + * + */ +class NEArgMinMaxLayer : public IFunction +{ +public: + /** Constructor */ + NEArgMinMaxLayer(std::shared_ptr memory_manager = nullptr); + /** Set the input and output tensors. + * + * @param[in] input Input source tensor. Data types supported: F16/F32. + * @param[in] axis Axis to find max/min index. + * @param[out] output Output source tensor. Data types supported: U32. + * @param[in] op Operation to perform: min or max + */ + void configure(ITensor *input, int axis, ITensor *output, const ReductionOperation &op); + /** Static function to check if given info will lead to a valid configuration of @ref NEArgMinMaxLayer + * + * @param[in] input Input source tensor info. Data types supported: F16/F32. + * @param[in] axis Axis to find max/min index. + * @param[in] output Output source tensor info. Data types supported: U32. + * @param[in] op Operation to perform: min or max + * + * @return a status + */ + static Status validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + NEReductionOperationKernel _reduction_kernel; + NEFillBorderKernel _fill_border_kernel; + bool _run_fill_border; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEARGMINMAXLAYER_H__ */ diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp index 9306e0303d..64e3cfe404 100644 --- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp +++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -31,6 +31,7 @@ #include "arm_compute/core/NEON/NEMath.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/NEON/wrapper/wrapper.h" #include @@ -39,11 +40,225 @@ namespace arm_compute { namespace { +uint32x4x4_t calculate_index(uint32_t idx, float32x4_t a, float32x4_t b, uint32x4x4_t c, ReductionOperation op, int axis) +{ + uint32x4_t mask{ 0 }; + if(op == ReductionOperation::ARG_IDX_MIN) + { + mask = wrapper::vcgt(b, a); + } + else + { + mask = wrapper::vclt(b, a); + } + + uint32x4_t vec_idx = { idx, idx + 1, idx + 2, idx + 3 }; + if(axis != 0) + { + vec_idx = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + } + uint32x4x4_t res = { wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0 }; + + return res; +} + +uint32x4x4_t calculate_index(uint32_t idx, uint8x16_t a, uint8x16_t b, uint32x4x4_t c, ReductionOperation op, int axis) +{ + uint32x4x4_t mask{ 0 }; + uint8x16_t mask_u8{ 0 }; + if(op == ReductionOperation::ARG_IDX_MIN) + { + mask_u8 = wrapper::vcgt(b, a); + } + else + { + mask_u8 = wrapper::vclt(b, a); + } + mask.val[0] = wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(mask_u8)))); + mask.val[1] = wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(mask_u8)))); + mask.val[2] = wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(mask_u8)))); + mask.val[3] = wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(mask_u8)))); + uint32x4x4_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 }, + { idx + 4, idx + 5, idx + 6, idx + 7 }, + { idx + 8, idx + 9, idx + 10, idx + 11 }, + { idx + 12, idx + 13, idx + 14, idx + 15 } + } + }; + if(axis != 0) + { + vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + vec_idx.val[2] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + vec_idx.val[3] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + } + uint32x4x4_t res = { vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]), + vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]), + vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]), + vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3]) + }; + + return res; +} + +uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float32x4_t vec_res_value, ReductionOperation op) +{ + uint32x4_t res_idx_mask{ 0 }; + uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); + + if(op == ReductionOperation::ARG_IDX_MIN) + { + auto pmin = wrapper::vpmin(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + pmin = wrapper::vpmin(pmin, pmin); + auto mask = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); + res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask); + } + else + { + auto pmax = wrapper::vpmax(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + pmax = wrapper::vpmax(pmax, pmax); + auto mask = vceqq_f32(vec_res_value, wrapper::vcombine(pmax, pmax)); + res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask); + } + + res_idx_mask = wrapper::vadd(res_idx_mask, mask_ones); + auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask), wrapper::vgetlow(res_idx_mask)); + pmin = wrapper::vpmin(pmin, pmin); + uint32_t res = wrapper::vgetlane(pmin, 0); + + return (res - 0xFFFFFFFF); +} + +uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, uint8x16_t vec_res_value, ReductionOperation op) +{ + uint32x4x4_t 
res_idx_mask{ 0 }; + uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); + uint8x16_t mask_u8{ 0 }; + if(op == ReductionOperation::ARG_IDX_MIN) + { + auto pmin = wrapper::vpmin(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + pmin = wrapper::vpmin(pmin, pmin); + pmin = wrapper::vpmin(pmin, pmin); + pmin = wrapper::vpmin(pmin, pmin); + mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); + } + else + { + auto pmax = wrapper::vpmax(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + pmax = wrapper::vpmax(pmax, pmax); + pmax = wrapper::vpmax(pmax, pmax); + pmax = wrapper::vpmax(pmax, pmax); + mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax)); + } + + // Widen vectors + auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8))); + auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8))); + auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1))); + auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1))); + auto wide_u32_3 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2))); + auto wide_u32_4 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2))); + res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1); + res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2); + res_idx_mask.val[2] = wrapper::vand(vec_res_idx.val[2], wide_u32_3); + res_idx_mask.val[3] = wrapper::vand(vec_res_idx.val[3], wide_u32_4); + res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones); + res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones); + res_idx_mask.val[2] = wrapper::vadd(res_idx_mask.val[2], mask_ones); + res_idx_mask.val[3] = wrapper::vadd(res_idx_mask.val[3], mask_ones); + + uint32_t res = 0xFFFFFFFF; + int iter = 0; + do + { + auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter])); + pmin = wrapper::vpmin(pmin, pmin); + res = std::min(wrapper::vgetlane(pmin, 0), res); + iter++; + } + while(iter < 4); + + return (res - 0xFFFFFFFF); +} +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis) +{ + uint32x4x2_t mask{ 0 }; + uint16x8_t mask_u16{ 0 }; + if(op == ReductionOperation::ARG_IDX_MIN) + { + mask_u16 = wrapper::vcgt(b, a); + } + else + { + mask_u16 = wrapper::vclt(b, a); + } + mask.val[0] = wrapper::vmovl(wrapper::vgetlow(mask_u16)); + mask.val[1] = wrapper::vmovl(wrapper::vgethigh(mask_u16)); + uint32x4x2_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 }, + { idx + 4, idx + 5, idx + 6, idx + 7 } + } + }; + if(axis != 0) + { + vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + } + uint32x4x4_t res = { wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]), + wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]), + 0, 0 + }; + + return res; +} + +uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_value, ReductionOperation op) +{ + uint32x4x2_t res_idx_mask{ 0 }; + uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); + uint16x8_t 
mask_u16; + if(op == ReductionOperation::ARG_IDX_MIN) + { + auto pmin = wrapper::vpmin(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + pmin = wrapper::vpmin(pmin, pmin); + pmin = wrapper::vpmin(pmin, pmin); + mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); + } + else + { + auto pmax = wrapper::vpmax(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + pmax = wrapper::vpmax(pmax, pmax); + pmax = wrapper::vpmax(pmax, pmax); + mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax)); + } + + // Widen vectors + auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16))); + auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16))); + res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1); + res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2); + res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones); + res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones); + + uint32_t res = 0xFFFFFFFF; + int iter = 0; + do + { + auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter])); + pmin = wrapper::vpmin(pmin, pmin); + res = std::min(wrapper::vgetlane(pmin, 0), res); + iter++; + } + while(iter < 2); + + return (res - 0xFFFFFFFF); +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + template class Reducer { public: - static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f) + static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op) { // Set out window Window out_window(window); @@ -58,11 +273,11 @@ public: Iterator in(input, in_slice); Iterator out(output, out_slice); - f(in, out, in_slice, out_slice, *input->info()); + f(in, out, in_slice, out_slice, *input->info(), op); } while(window.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice)); } - static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f) + static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op) { // Set in window Window in_window(window); @@ -80,11 +295,11 @@ public: Iterator in(input, in_slice); Iterator out(output, out_slice); - f(in, out, in_slice, out_slice, *input->info(), 1); + f(in, out, in_slice, out_slice, *input->info(), 1, op); } while(in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice)); } - static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f) + static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op) { // Set in window Window in_window(window); @@ -102,11 +317,11 @@ public: Iterator in(input, in_slice); Iterator out(output, out_slice); - f(in, out, in_slice, out_slice, *input->info(), 2); + f(in, out, in_slice, out_slice, *input->info(), 2, op); } while(in_window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_3D(out_slice)); } - static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f) + static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op) { // Set in/out window Window in_window(window); @@ -124,115 +339,205 @@ public: Iterator in(input, in_slice); Iterator out(output, out_slice); - f(in, out, 
in_slice, out_slice, *input->info(), 3); + f(in, out, in_slice, out_slice, *input->info(), 3, op); } while(in_window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_4D(out_slice)); } }; -template +template struct RedOpX { /** NEON vector tag type. */ using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info) + inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, const ReductionOperation op) { ARM_COMPUTE_UNUSED(out_slice); - auto vec_sum_value = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + auto init_res_value = static_cast(0.f); + if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN) + { + init_res_value = *reinterpret_cast(input.ptr()); + } + auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{}); + uint32x4x4_t vec_res_idx{ 0 }; execute_window_loop(in_slice, [&](const Coordinates & id) { const auto in_ptr = reinterpret_cast(input.ptr()); const auto vec_elements = wrapper::vloadq(in_ptr); - if(op == ReductionOperation::SUM_SQUARE) - { - vec_sum_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_sum_value); - } - else + switch(op) { - vec_sum_value = wrapper::vadd(vec_elements, vec_sum_value); + case ReductionOperation::SUM_SQUARE: + vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); + break; + case ReductionOperation::MEAN_SUM: + case ReductionOperation::SUM: + vec_res_value = wrapper::vadd(vec_elements, vec_res_value); + break; + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } }, input); - auto carry_addition = wrapper::vpadd(wrapper::vgethigh(vec_sum_value), wrapper::vgetlow(vec_sum_value)); - for(int i = 0; i < S / 4; ++i) + switch(op) { - carry_addition = wrapper::vpadd(carry_addition, carry_addition); - } + case ReductionOperation::SUM: + case ReductionOperation::SUM_SQUARE: + case ReductionOperation::MEAN_SUM: + { + auto carry_res = wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + for(int i = 0; i < S / 4; ++i) + { + carry_res = wrapper::vpadd(carry_res, carry_res); + } + auto res = wrapper::vgetlane(carry_res, 0); - auto res = wrapper::vgetlane(carry_addition, 0); - if(op == ReductionOperation::MEAN_SUM) - { - res /= in_info.dimension(0); - } + if(op == ReductionOperation::MEAN_SUM) + { + res /= in_info.dimension(0); + } - *(reinterpret_cast(output.ptr())) = res; + *(reinterpret_cast(output.ptr())) = res; + break; + } + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::ARG_IDX_MAX: + { + auto res = calculate_vector_index(vec_res_idx, vec_res_value, op); + *(reinterpret_cast(output.ptr())) = res; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } } }; -template struct RedOpX_qasymm8 { - inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, 
const TensorInfo &in_info) + inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, const ReductionOperation op) { ARM_COMPUTE_UNUSED(out_slice); - auto vec_sum_value1 = vdupq_n_u32(static_cast(0.f)); - auto vec_sum_value2 = vdupq_n_u32(static_cast(0.f)); - auto vec_sum_value3 = vdupq_n_u32(static_cast(0.f)); - auto vec_sum_value4 = vdupq_n_u32(static_cast(0.f)); + auto vec_res_value1 = vdupq_n_u32(static_cast(0.f)); + auto vec_res_value2 = vdupq_n_u32(static_cast(0.f)); + auto vec_res_value3 = vdupq_n_u32(static_cast(0.f)); + auto vec_res_value4 = vdupq_n_u32(static_cast(0.f)); + + uint8x16_t vec_res_value; + if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN) + { + vec_res_value = wrapper::vdup_n(*input.ptr(), wrapper::traits::vector_128_tag{}); + } + uint32x4x4_t vec_res_idx{ 0 }; execute_window_loop(in_slice, [&](const Coordinates & id) { const auto vec_elements = wrapper::vloadq(input.ptr()); - - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - vec_sum_value1 = wrapper::vadd(temp32x4t_1, vec_sum_value1); - vec_sum_value2 = wrapper::vadd(temp32x4t_2, vec_sum_value2); - vec_sum_value3 = wrapper::vadd(temp32x4t_3, vec_sum_value3); - vec_sum_value4 = wrapper::vadd(temp32x4t_4, vec_sum_value4); + switch(op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + { + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); + + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); + vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); + vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); + vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); + break; + } + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } }, input); - auto carry_addition = wrapper::vadd(vec_sum_value1, vec_sum_value2); - carry_addition = wrapper::vadd(carry_addition, vec_sum_value3); - carry_addition = wrapper::vadd(carry_addition, vec_sum_value4); - - auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_addition), wrapper::vgetlow(carry_addition)); - carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition); - auto res = wrapper::vgetlane(carry_paddition, 0); - - 
if(op == ReductionOperation::MEAN_SUM) + if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) { - res /= in_info.dimension(0); + auto res = calculate_vector_index(vec_res_idx, vec_res_value, op); + *(reinterpret_cast(output.ptr())) = res; } + else + { + auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2); + carry_res = wrapper::vadd(carry_res, vec_res_value3); + carry_res = wrapper::vadd(carry_res, vec_res_value4); - *(output.ptr()) = static_cast(res); + auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res)); + carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition); + auto res = wrapper::vgetlane(carry_paddition, 0); + + if(op == ReductionOperation::MEAN_SUM) + { + res /= in_info.dimension(0); + } + + *(output.ptr()) = static_cast(res); + } } }; -template +template struct RedOpYZW { /** NEON vector tag type. */ using ExactTagType = typename wrapper::traits::neon_vector::tag_type; + using neon_vector = typename wrapper::traits::neon_vector::type; - inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis) + inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis, const ReductionOperation op) { ARM_COMPUTE_UNUSED(out_slice); execute_window_loop(in_slice, [&](const Coordinates & id) { - auto vec_sum_value = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + neon_vector vec_res_value; + if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN) + { + vec_res_value = wrapper::vloadq(reinterpret_cast(input.ptr())); + } + else + { + vec_res_value = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + } + uint32x4x4_t vec_res_idx{ 0 }; + for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) { T *in_ptr; @@ -252,41 +557,68 @@ struct RedOpYZW } const auto vec_elements = wrapper::vloadq(in_ptr); - if(op == ReductionOperation::SUM_SQUARE) + switch(op) { - vec_sum_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_sum_value); - } - else - { - vec_sum_value = wrapper::vadd(vec_elements, vec_sum_value); + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + vec_res_value = wrapper::vadd(vec_elements, vec_res_value); + break; + case ReductionOperation::SUM_SQUARE: + vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); + break; + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } } if(op == ReductionOperation::MEAN_SUM) { auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast(in_info.dimension(axis)), ExactTagType{})); - vec_sum_value = wrapper::vmul(vec_sum_value, vec_width_inv); + vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv); } - wrapper::vstore(reinterpret_cast(output.ptr()), vec_sum_value); + if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) + { + 
wrapper::vstore(reinterpret_cast(output.ptr()), vec_res_idx.val[0]); + } + else + { + wrapper::vstore(reinterpret_cast(output.ptr()), vec_res_value); + } }, input, output); } }; -template struct RedOpYZW_qasymm8 { - inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis) + inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis, const ReductionOperation op) { ARM_COMPUTE_UNUSED(out_slice); execute_window_loop(in_slice, [&](const Coordinates & id) { - auto vec_sum_value1 = vdupq_n_u32(static_cast(0.f)); - auto vec_sum_value2 = vdupq_n_u32(static_cast(0.f)); - auto vec_sum_value3 = vdupq_n_u32(static_cast(0.f)); - auto vec_sum_value4 = vdupq_n_u32(static_cast(0.f)); + uint32x4x4_t vec_res_idx{ 0 }; + auto vec_res_value1 = vdupq_n_u32(0); + auto vec_res_value2 = vdupq_n_u32(0); + auto vec_res_value3 = vdupq_n_u32(0); + auto vec_res_value4 = vdupq_n_u32(0); + auto vec_res_value = wrapper::vloadq(input.ptr()); + for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) { uint8_t *in_ptr; @@ -306,169 +638,78 @@ struct RedOpYZW_qasymm8 } const auto vec_elements = wrapper::vloadq(in_ptr); - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - vec_sum_value1 = wrapper::vadd(temp32x4t_1, vec_sum_value1); - vec_sum_value2 = wrapper::vadd(temp32x4t_2, vec_sum_value2); - vec_sum_value3 = wrapper::vadd(temp32x4t_3, vec_sum_value3); - vec_sum_value4 = wrapper::vadd(temp32x4t_4, vec_sum_value4); + switch(op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + { + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); + + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); + vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); + vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); + vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); + break; + } + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } } if(op == ReductionOperation::MEAN_SUM) { const auto vec_width_inv = wrapper::vinv(vdupq_n_f32(in_info.dimension(axis))); - const auto vec_sum_value1_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value1), 
vec_width_inv); - const auto vec_sum_value2_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value2), vec_width_inv); - const auto vec_sum_value3_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value3), vec_width_inv); - const auto vec_sum_value4_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value4), vec_width_inv); - - vec_sum_value1 = vcvtq_u32_f32(vec_sum_value1_f); - vec_sum_value2 = vcvtq_u32_f32(vec_sum_value2_f); - vec_sum_value3 = vcvtq_u32_f32(vec_sum_value3_f); - vec_sum_value4 = vcvtq_u32_f32(vec_sum_value4_f); - } - - const auto temp16x8t_1 = vcombine_u16(wrapper::vqmovn(vec_sum_value1), wrapper::vqmovn(vec_sum_value2)); - const auto temp16x8t_2 = vcombine_u16(wrapper::vqmovn(vec_sum_value3), wrapper::vqmovn(vec_sum_value4)); - auto res = vcombine_u8(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); - wrapper::vstore(output.ptr(), res); - }, - input, output); - } -}; - -void reduce_sumsq(const Window &window, const ITensor *input, ITensor *output, unsigned int axis) -{ - switch(axis) - { - case 0: - switch(input->info()->data_type()) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - return Reducer>::reduceX(window, input, output, RedOpX()); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - return Reducer>::reduceX(window, input, output, RedOpX()); - case DataType::QASYMM8: - default: - ARM_COMPUTE_ERROR("Not supported"); - } - case 1: - switch(input->info()->data_type()) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - return Reducer>::reduceY(window, input, output, RedOpYZW()); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - return Reducer>::reduceY(window, input, output, RedOpYZW()); - case DataType::QASYMM8: - default: - ARM_COMPUTE_ERROR("Not supported"); + const auto vec_res_value1_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value1), vec_width_inv); + const auto vec_res_value2_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value2), vec_width_inv); + const auto vec_res_value3_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value3), vec_width_inv); + const auto vec_res_value4_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value4), vec_width_inv); + + vec_res_value1 = vcvtq_u32_f32(vec_res_value1_f); + vec_res_value2 = vcvtq_u32_f32(vec_res_value2_f); + vec_res_value3 = vcvtq_u32_f32(vec_res_value3_f); + vec_res_value4 = vcvtq_u32_f32(vec_res_value4_f); } - case 2: - switch(input->info()->data_type()) + if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - return Reducer>::reduceZ(window, input, output, RedOpYZW()); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - return Reducer>::reduceZ(window, input, output, RedOpYZW()); - case DataType::QASYMM8: - default: - ARM_COMPUTE_ERROR("Not supported"); + wrapper::vstore(reinterpret_cast(output.ptr()), vec_res_idx.val[0]); + wrapper::vstore(reinterpret_cast(output.ptr()) + 4, vec_res_idx.val[1]); + wrapper::vstore(reinterpret_cast(output.ptr()) + 8, vec_res_idx.val[2]); + wrapper::vstore(reinterpret_cast(output.ptr()) + 12, vec_res_idx.val[3]); } - case 3: - switch(input->info()->data_type()) + else { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - return Reducer>::reduceW(window, input, output, RedOpYZW()); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - return Reducer>::reduceW(window, input, output, RedOpYZW()); - case DataType::QASYMM8: - default: - ARM_COMPUTE_ERROR("Not supported"); + const auto temp16x8t_1 
= vcombine_u16(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2)); + const auto temp16x8t_2 = vcombine_u16(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4)); + auto res = vcombine_u8(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); + wrapper::vstore(output.ptr(), res); } - default: - ARM_COMPUTE_ERROR("Unsupported reduction axis"); - } -} -void reduce_sum(const Window &window, const ITensor *input, ITensor *output, unsigned int axis) -{ - switch(axis) - { - case 0: - switch(input->info()->data_type()) - { - case DataType::QASYMM8: - return Reducer>::reduceX(window, input, output, RedOpX_qasymm8()); -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - return Reducer>::reduceX(window, input, output, RedOpX()); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - return Reducer>::reduceX(window, input, output, RedOpX()); - default: - ARM_COMPUTE_ERROR("Not supported"); - } - case 1: - switch(input->info()->data_type()) - { - case DataType::QASYMM8: - return Reducer>::reduceY(window, input, output, RedOpYZW_qasymm8()); -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - return Reducer>::reduceY(window, input, output, RedOpYZW()); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - return Reducer>::reduceY(window, input, output, RedOpYZW()); - default: - ARM_COMPUTE_ERROR("Not supported"); - } - case 2: - switch(input->info()->data_type()) - { - case DataType::QASYMM8: - return Reducer>::reduceZ(window, input, output, RedOpYZW_qasymm8()); -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - return Reducer>::reduceZ(window, input, output, RedOpYZW()); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - return Reducer>::reduceZ(window, input, output, RedOpYZW()); - default: - ARM_COMPUTE_ERROR("Not supported"); - } - case 3: - switch(input->info()->data_type()) - { - case DataType::QASYMM8: - return Reducer>::reduceW(window, input, output, RedOpYZW_qasymm8()); -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - return Reducer>::reduceW(window, input, output, RedOpYZW()); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - return Reducer>::reduceW(window, input, output, RedOpYZW()); - default: - ARM_COMPUTE_ERROR("Not supported"); - } - default: - ARM_COMPUTE_ERROR("Unsupported reduction axis"); + }, + input, output); } -} -void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output, unsigned int axis) +}; + +void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op) { switch(axis) { @@ -476,13 +717,13 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output switch(input->info()->data_type()) { case DataType::QASYMM8: - return Reducer>::reduceX(window, input, output, RedOpX_qasymm8()); + return Reducer::reduceX(window, input, output, RedOpX_qasymm8(), op); #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - return Reducer>::reduceX(window, input, output, RedOpX()); + return Reducer>::reduceX(window, input, output, RedOpX(), op); #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: - return Reducer>::reduceX(window, input, output, RedOpX()); + return Reducer>::reduceX(window, input, output, RedOpX(), op); default: ARM_COMPUTE_ERROR("Not supported"); } @@ -490,13 +731,13 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output 
switch(input->info()->data_type()) { case DataType::QASYMM8: - return Reducer>::reduceY(window, input, output, RedOpYZW_qasymm8()); + return Reducer::reduceY(window, input, output, RedOpYZW_qasymm8(), op); #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - return Reducer>::reduceY(window, input, output, RedOpYZW()); + return Reducer>::reduceY(window, input, output, RedOpYZW(), op); #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: - return Reducer>::reduceY(window, input, output, RedOpYZW()); + return Reducer>::reduceY(window, input, output, RedOpYZW(), op); default: ARM_COMPUTE_ERROR("Not supported"); } @@ -504,13 +745,13 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output switch(input->info()->data_type()) { case DataType::QASYMM8: - return Reducer>::reduceZ(window, input, output, RedOpYZW_qasymm8()); + return Reducer::reduceZ(window, input, output, RedOpYZW_qasymm8(), op); #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - return Reducer>::reduceZ(window, input, output, RedOpYZW()); + return Reducer>::reduceZ(window, input, output, RedOpYZW(), op); #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: - return Reducer>::reduceZ(window, input, output, RedOpYZW()); + return Reducer>::reduceZ(window, input, output, RedOpYZW(), op); default: ARM_COMPUTE_ERROR("Not supported"); } @@ -518,13 +759,13 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output switch(input->info()->data_type()) { case DataType::QASYMM8: - return Reducer>::reduceW(window, input, output, RedOpYZW_qasymm8()); + return Reducer::reduceW(window, input, output, RedOpYZW_qasymm8(), op); #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - return Reducer>::reduceW(window, input, output, RedOpYZW()); + return Reducer>::reduceW(window, input, output, RedOpYZW(), op); #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: - return Reducer>::reduceW(window, input, output, RedOpYZW()); + return Reducer>::reduceW(window, input, output, RedOpYZW(), op); default: ARM_COMPUTE_ERROR("Not supported"); } @@ -533,14 +774,6 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output } } -TensorShape calculate_output_shape(const TensorShape &input_shape, unsigned int axis) -{ - TensorShape output_shape{ input_shape }; - output_shape.set(axis, 1); - - return output_shape; -} - Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op) { ARM_COMPUTE_UNUSED(op); @@ -553,10 +786,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u if(output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN); + if(!is_arg_min_max) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32); + } + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); - const TensorShape output_shape = calculate_output_shape(input->tensor_shape(), axis); + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis); const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped); } @@ -564,13 
+806,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u return Status{}; } -std::tuple validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis) +std::tuple validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis, ReductionOperation op) { // Calculate output shape and set if empty - const TensorShape output_shape = calculate_output_shape(input->tensor_shape(), axis); + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis); // Output auto initialization if not yet initialized - auto_init_if_empty(*output, output_shape, 1, input->data_type()); + const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX); + DataType output_data_type = is_arg_min_max ? DataType::U32 : input->data_type(); + auto_init_if_empty(*output, output_shape, 1, output_data_type); unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type()); @@ -613,7 +857,7 @@ void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output _reduction_axis = axis; // Configure kernel window - auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis); + auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis, op); ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); @@ -623,7 +867,7 @@ void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output Status NEReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); - ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis))); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis, op))); return Status{}; } @@ -634,19 +878,6 @@ void NEReductionOperationKernel::run(const Window &window, const ThreadInfo &inf ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - switch(_op) - { - case ReductionOperation::SUM_SQUARE: - reduce_sumsq(window, _input, _output, _reduction_axis); - break; - case ReductionOperation::MEAN_SUM: - reduce_mean_sum(window, _input, _output, _reduction_axis); - break; - case ReductionOperation::SUM: - reduce_sum(window, _input, _output, _reduction_axis); - break; - default: - ARM_COMPUTE_ERROR("Unsupported reduction operation."); - } + reduce_op(window, _input, _output, _reduction_axis, _op); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp index 0c134c00ed..f2697bcc6d 100644 --- a/src/core/NEON/kernels/NESelectKernel.cpp +++ b/src/core/NEON/kernels/NESelectKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -14,9 +14,9 @@ * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INNEUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY NEAIM, DAMAGES OR OTHER + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. @@ -67,7 +67,7 @@ void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITen const auto c = (*condition_conversion)(condition_ptr + x); const auto a = wrapper::vloadq(input1_ptr + x); const auto b = wrapper::vloadq(input2_ptr + x); - wrapper::vstore(output_ptr + x, wrapper::vbitselect(c, a, b)); + wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b)); } for(; x < window_end_x; ++x) { @@ -90,7 +90,7 @@ void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, IT select_op(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) { static const auto zero = wrapper::vdup_n(static_cast(0), arm_compute::wrapper::traits::vector_128_tag()); - return wrapper::vgreaterthan(wrapper::vloadq(condition_ptr), zero); + return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero); }); } @@ -104,7 +104,7 @@ void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, I select_op(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) { static const auto zero = wrapper::vdup_n(static_cast(0), arm_compute::wrapper::traits::vector_128_tag()); - return wrapper::vgreaterthan(wrapper::vmovl(wrapper::vload(condition_ptr)), zero); + return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero); }); } @@ -118,7 +118,7 @@ void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, I select_op(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) { static const auto zero = wrapper::vdup_n(static_cast(0), arm_compute::wrapper::traits::vector_128_tag()); - return wrapper::vgreaterthan(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero); + return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero); }); } diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp new file mode 100644 index 0000000000..d33e1342b9 --- /dev/null +++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +namespace arm_compute +{ +NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr memory_manager) + : _memory_group(std::move(memory_manager)), _reduction_kernel(), _fill_border_kernel(), _run_fill_border(false) +{ +} +void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, const ReductionOperation &op) +{ + _reduction_kernel.configure(input, output, axis, op); + + if(axis == 0) + { + _fill_border_kernel.configure(input, _reduction_kernel.border_size(), BorderMode::REPLICATE); + _run_fill_border = true; + } +} + +Status NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) +{ + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation"); + ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output, axis, op)); + return Status{}; +} + +void NEArgMinMaxLayer::run() +{ + _memory_group.acquire(); + + if(_run_fill_border) + { + NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY); + } + NEScheduler::get().schedule(&_reduction_kernel, Window::DimY); + + _memory_group.release(); +} + +} // namespace arm_compute \ No newline at end of file diff --git a/tests/validation/NEON/ArgMinMax.cpp b/tests/validation/NEON/ArgMinMax.cpp new file mode 100644 index 0000000000..611495a41d --- /dev/null +++ b/tests/validation/NEON/ArgMinMax.cpp @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h" +#include "arm_compute/runtime/Tensor.h" +#include "arm_compute/runtime/TensorAllocator.h" + +#include "tests/NEON/Accessor.h" +#include "tests/datasets/ShapeDatasets.h" +#include "tests/datasets/SplitDataset.h" +#include "tests/framework/Asserts.h" +#include "tests/framework/Macros.h" +#include "tests/validation/Validation.h" +#include "tests/validation/fixtures/ArgMinMaxFixture.h" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +TEST_SUITE(NEON) +TEST_SUITE(ArgMinMax) + +// *INDENT-OFF* +// clang-format off +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( + framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 3U, 16U, 2U), 1, DataType::F32), // Invalid axis + TensorInfo(TensorShape(27U, 3U, 16U, 2U), 1, DataType::F32), // Invalid output shape + TensorInfo(TensorShape(32U, 16U, 16U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 16U, 16U, 2U), 1, DataType::F32) // Invalid operation + }), + framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(27U, 3U, 1U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(27U, 3U, 1U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 16U, 1U, 2U), 1, DataType::U32), + TensorInfo(TensorShape(32U, 16U, 1U, 2U), 1, DataType::F32) + })), + framework::dataset::make("Axis", { 4, 0, 2, 0 })), + framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MAX, ReductionOperation::ARG_IDX_MAX, ReductionOperation::ARG_IDX_MAX, ReductionOperation::MEAN_SUM })), + framework::dataset::make("Expected", { false, false, true, false })), + input_info, output_info, axis, operation, expected) +{ + const Status status = NEArgMinMaxLayer::validate(&input_info.clone()->set_is_resizable(false), axis, &output_info.clone()->set_is_resizable(false), operation); + ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); +} +// clang-format on +// *INDENT-ON* + +DATA_TEST_CASE(Configuration, + framework::DatasetMode::ALL, + combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::F16, DataType::F32 })), + shape, data_type) +{ + // Create tensors + Tensor ref_src = create_tensor(shape, data_type); + Tensor dst; + + // Create and Configure function + NEArgMinMaxLayer arg_min_max_layer; + arg_min_max_layer.configure(&ref_src, 1, &dst, ReductionOperation::ARG_IDX_MAX); + + // Validate valid region + TensorShape output_shape = shape; + output_shape.set(1, 1); + const ValidRegion valid_region = shape_to_valid_region(output_shape); + validate(dst.info()->valid_region(), valid_region); +} + +template +using NEArgMinMaxValidationFixture = ArgMinMaxValidationFixture; + +TEST_SUITE(Float) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +TEST_SUITE(FP16) +FIXTURE_DATA_TEST_CASE(RunSmall, + NEArgMinMaxValidationFixture, + framework::DatasetMode::PRECOMMIT, + combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX }))) +{ + // Validate output + validate(Accessor(_target), _reference); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, + NEArgMinMaxValidationFixture, + framework::DatasetMode::NIGHTLY, + combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), 
framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX }))) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // FP16 +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +template +using NEArgMinMaxQuantizedValidationFixture = ArgMinMaxValidationQuantizedFixture; + +TEST_SUITE(QASYMM8) +FIXTURE_DATA_TEST_CASE(RunSmall, + NEArgMinMaxQuantizedValidationFixture, + framework::DatasetMode::PRECOMMIT, + combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), + framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) }))) +{ + // Validate output + validate(Accessor(_target), _reference); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, + NEArgMinMaxQuantizedValidationFixture, + framework::DatasetMode::NIGHTLY, + combine(combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), + framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) }))) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // QASYMM8 + +TEST_SUITE(FP32) +FIXTURE_DATA_TEST_CASE(RunSmall, + NEArgMinMaxValidationFixture, + framework::DatasetMode::PRECOMMIT, + combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX }))) +{ + // Validate output + validate(Accessor(_target), _reference); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, + NEArgMinMaxValidationFixture, + framework::DatasetMode::NIGHTLY, + combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX }))) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // FP32 +TEST_SUITE_END() // Float +TEST_SUITE_END() // ArgMinMax +TEST_SUITE_END() // NEON +} // namespace validation +} // namespace test +} // namespace arm_compute diff --git a/tests/validation/fixtures/ArgMinMaxFixture.h b/tests/validation/fixtures/ArgMinMaxFixture.h index 5f5f85c104..e263b25bf2 100644 --- a/tests/validation/fixtures/ArgMinMaxFixture.h +++ b/tests/validation/fixtures/ArgMinMaxFixture.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -42,28 +42,38 @@ namespace test namespace validation { template -class ArgMinMaxValidationFixture : public framework::Fixture +class ArgMinMaxValidationBaseFixture : public framework::Fixture { public: template - void setup(TensorShape shape, DataType data_type, int axis, ReductionOperation op) + void setup(TensorShape shape, DataType data_type, int axis, ReductionOperation op, QuantizationInfo q_info) { - _target = compute_target(shape, data_type, axis, op); - _reference = compute_reference(shape, data_type, axis, op); + _target = compute_target(shape, data_type, axis, op, q_info); + _reference = compute_reference(shape, data_type, axis, op, q_info); } protected: template void fill(U &&tensor) { - std::uniform_real_distribution<> distribution(-1.0f, 1.0f); - library->fill(tensor, distribution, 0); + if(!is_data_type_quantized(tensor.data_type())) + { + std::uniform_real_distribution<> distribution(-1.0f, 1.0f); + library->fill(tensor, distribution, 0); + } + else + { + std::pair bounds = get_quantized_bounds(tensor.quantization_info(), -1.0f, 1.0f); + std::uniform_int_distribution distribution(bounds.first, bounds.second); + + library->fill(tensor, distribution, 0); + } } - TensorType compute_target(TensorShape &src_shape, DataType data_type, int axis, ReductionOperation op) + TensorType compute_target(TensorShape &src_shape, DataType data_type, int axis, ReductionOperation op, QuantizationInfo q_info) { // Create tensors - TensorType src = create_tensor(src_shape, data_type, 1); + TensorType src = create_tensor(src_shape, data_type, 1, q_info); TensorType dst; // Create and configure function @@ -89,21 +99,43 @@ protected: return dst; } - SimpleTensor compute_reference(TensorShape &src_shape, DataType data_type, int axis, ReductionOperation op) + SimpleTensor compute_reference(TensorShape &src_shape, DataType data_type, int axis, ReductionOperation op, QuantizationInfo q_info) { // Create reference - SimpleTensor src{ src_shape, data_type, 1 }; + SimpleTensor src{ src_shape, data_type, 1, q_info }; // Fill reference fill(src); TensorShape output_shape = src_shape; output_shape.set(axis, 1); - return reference::reduction_operation(src, output_shape, axis, op); + return reference::reduction_operation(src, output_shape, axis, op); } - TensorType _target{}; - SimpleTensor _reference{}; + TensorType _target{}; + SimpleTensor _reference{}; +}; + +template +class ArgMinMaxValidationQuantizedFixture : public ArgMinMaxValidationBaseFixture +{ +public: + template + void setup(const TensorShape &shape, DataType data_type, int axis, ReductionOperation op, QuantizationInfo quantization_info) + { + ArgMinMaxValidationBaseFixture::setup(shape, data_type, axis, op, quantization_info); + } +}; + +template +class ArgMinMaxValidationFixture : public ArgMinMaxValidationBaseFixture +{ +public: + template + void setup(const TensorShape &shape, DataType data_type, int axis, ReductionOperation op) + { + ArgMinMaxValidationBaseFixture::setup(shape, data_type, axis, op, QuantizationInfo()); + } }; } // namespace validation } // namespace test diff --git a/tests/validation/fixtures/ReduceMeanFixture.h b/tests/validation/fixtures/ReduceMeanFixture.h index 769d7f674f..44bb9fca6a 100644 --- a/tests/validation/fixtures/ReduceMeanFixture.h +++ b/tests/validation/fixtures/ReduceMeanFixture.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -113,7 +113,7 @@ protected: { TensorShape output_shape = i == 0 ? src_shape : out.shape(); output_shape.set(axis[i], 1); - out = reference::reduction_operation(i == 0 ? src : out, output_shape, axis[i], ReductionOperation::MEAN_SUM); + out = reference::reduction_operation(i == 0 ? src : out, output_shape, axis[i], ReductionOperation::MEAN_SUM); } if(!keep_dims) diff --git a/tests/validation/fixtures/ReductionOperationFixture.h b/tests/validation/fixtures/ReductionOperationFixture.h index 9079b47cbb..d01f41abf0 100644 --- a/tests/validation/fixtures/ReductionOperationFixture.h +++ b/tests/validation/fixtures/ReductionOperationFixture.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -107,7 +107,7 @@ protected: // Fill reference fill(src); - return reference::reduction_operation(src, dst_shape, axis, op); + return reference::reduction_operation(src, dst_shape, axis, op); } TensorType _target{}; diff --git a/tests/validation/reference/L2NormalizeLayer.cpp b/tests/validation/reference/L2NormalizeLayer.cpp index fcd6226f07..43885b29e2 100644 --- a/tests/validation/reference/L2NormalizeLayer.cpp +++ b/tests/validation/reference/L2NormalizeLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -54,7 +54,7 @@ SimpleTensor l2_normalize(const SimpleTensor &src, unsigned int axis, floa SimpleTensor dst{ src.shape(), src.data_type() }; // Reduce across given axis - SimpleTensor sum = reduction_operation(src, get_output_shape(src.shape(), axis), axis, ReductionOperation::SUM_SQUARE); + SimpleTensor sum = reduction_operation(src, get_output_shape(src.shape(), axis), axis, ReductionOperation::SUM_SQUARE); // Compute reference const int upper_dims = src.shape().total_size_upper(axis + 1); diff --git a/tests/validation/reference/ReductionOperation.cpp b/tests/validation/reference/ReductionOperation.cpp index 37a9be86c0..fc12e31d75 100644 --- a/tests/validation/reference/ReductionOperation.cpp +++ b/tests/validation/reference/ReductionOperation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -49,20 +49,20 @@ OT reduce_operation(const T *ptr, int reduce_elements, ReductionOperation op, in uint32_t int_res = 0; for(int i = 0; i < reduce_elements; ++i) { - auto elem = static_cast(*(ptr + stride * i)); + auto elem = *(ptr + stride * i); switch(op) { case ReductionOperation::ARG_IDX_MIN: - if(static_cast(*(ptr + stride * static_cast(res))) > elem) + if(*(ptr + stride * static_cast(int_res)) > elem) { - res = static_cast(i); + int_res = static_cast(i); } break; case ReductionOperation::ARG_IDX_MAX: - if(static_cast(*(ptr + stride * static_cast(res))) < elem) + if(*(ptr + stride * static_cast(int_res)) < elem) { - res = static_cast(i); + int_res = static_cast(i); } break; case ReductionOperation::SUM_SQUARE: @@ -122,13 +122,13 @@ OT reduce_operation(const T *ptr, int reduce_elements, ReductionOperation op, in } } // namespace -template -SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op) +template +SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op) { // Create reference const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX); DataType output_data_type = is_arg_min_max ? DataType::U32 : src.data_type(); - SimpleTensor dst{ dst_shape, output_data_type, 1, src.quantization_info() }; + SimpleTensor dst{ dst_shape, output_data_type, 1, src.quantization_info() }; const unsigned int src_width = src.shape().x(); const unsigned int src_height = src.shape().y(); const unsigned int src_depth = src.shape().z(); @@ -143,14 +143,7 @@ SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShap for(unsigned int du = 0; du < upper_dims; ++du) { const T *src_row_ptr = src.data() + du * reduce_elems; - if(is_arg_min_max) - { - dst[du] = reduce_operation(src_row_ptr, reduce_elems, op, 1); - } - else - { - dst[du] = reduce_operation(src_row_ptr, reduce_elems, op, 1); - } + dst[du] = reduce_operation(src_row_ptr, reduce_elems, op, 1); } } break; @@ -164,15 +157,7 @@ SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShap const int in_offset = du * src_height * src_width + x; const int out_offset = du * src_width + x; const T *src_row_ptr = src.data() + in_offset; - - if(is_arg_min_max) - { - dst[out_offset] = reduce_operation(src_row_ptr, reduce_elems, op, src_width); - } - else - { - dst[out_offset] = reduce_operation(src_row_ptr, reduce_elems, op, src_width); - } + dst[out_offset] = reduce_operation(src_row_ptr, reduce_elems, op, src_width); } } } @@ -189,15 +174,7 @@ SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShap const int in_offset = du * src_depth * src_height * src_width + y * src_width + x; const int out_offset = du * src_width * src_height + y * src_width + x; const T *src_row_ptr = src.data() + in_offset; - - if(is_arg_min_max) - { - dst[out_offset] = reduce_operation(src_row_ptr, reduce_elems, op, src_height * src_width); - } - else - { - dst[out_offset] = reduce_operation(src_row_ptr, reduce_elems, op, src_height * src_width); - } + dst[out_offset] = reduce_operation(src_row_ptr, reduce_elems, op, src_height * src_width); } } } @@ -217,14 +194,7 @@ SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShap const int in_offset = du * src_batch * src_depth * src_height * src_width + z * src_width * src_height + y * src_width + x; const int out_offset = du * src_depth * 
src_height * src_width + z * src_width * src_height + y * src_width + x; const T *src_row_ptr = src.data() + in_offset; - if(is_arg_min_max) - { - dst[out_offset] = reduce_operation(src_row_ptr, reduce_elems, op, src_width * src_height * src_depth); - } - else - { - dst[out_offset] = reduce_operation(src_row_ptr, reduce_elems, op, src_width * src_height * src_depth); - } + dst[out_offset] = reduce_operation(src_row_ptr, reduce_elems, op, src_width * src_height * src_depth); } } } @@ -238,6 +208,9 @@ SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShap return dst; } +template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op); +template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op); +template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op); template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op); template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op); template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op); diff --git a/tests/validation/reference/ReductionOperation.h b/tests/validation/reference/ReductionOperation.h index 859b57aa7b..9f7050f551 100644 --- a/tests/validation/reference/ReductionOperation.h +++ b/tests/validation/reference/ReductionOperation.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -35,10 +35,10 @@ namespace validation { namespace reference { -template -SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op); +template +SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op); } // namespace reference } // namespace validation } // namespace test } // namespace arm_compute -#endif /* __ARM_COMPUTE_TEST_FLOOR_H__ */ +#endif /* __ARM_COMPUTE_TEST_REDUCTION_OPERATION_H__ */ -- cgit v1.2.1
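
Usage note (not part of the patch): a minimal sketch of how the new NEArgMinMaxLayer runtime function is driven, mirroring the configure()/run() sequence exercised by the Configuration test in tests/validation/NEON/ArgMinMax.cpp above. The tensor shape, the axis value and the fill step are illustrative assumptions rather than values taken from the change itself.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // Source: a 4D F32 tensor; the index of the maximum is taken along axis 1.
    Tensor src{};
    Tensor dst{};
    src.allocator()->init(TensorInfo(TensorShape(27U, 3U, 16U, 2U), 1, DataType::F32));

    // Configure the function; the output is auto-initialised to U32 with the
    // reduced shape, as done by validate_and_configure_window() in the kernel.
    NEArgMinMaxLayer argminmax{};
    argminmax.configure(&src, 1, &dst, ReductionOperation::ARG_IDX_MAX);

    // Allocate backing memory, fill the input, then run.
    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src here (e.g. via a Window/Iterator or by importing data) ...
    argminmax.run();

    return 0;
}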