COMPMID-1764 NEON: Implement ArgMax/ArgMin

Change-Id: Ibe23aa90b36ffd8553d1d1c35fada5d300fab829 Reviewed-on: https://review.mlplatform.org/475 Reviewed-by: Isabella Gottardi <isabella.gottardi@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Giuseppe Rossini <giuseppe.rossini@arm.com>
author: Michalis Spyrou <michalis.spyrou@arm.com> 2019-01-03 11:10:25 +0000
committer: Michalis Spyrou <michalis.spyrou@arm.com> 2019-01-10 16:24:26 +0000
commit: aea14c63e2efeda9d5f7492099389d439c65204f (patch)
tree: 176a6181bbf00e4df078d5da0a17dd44f248958e /src/core/NEON/kernels/NEReductionOperationKernel.cpp
parent: c10bc0b5db5169a6ccea02a1aaefe34f082709e5 (diff)
download: ComputeLibrary-aea14c63e2efeda9d5f7492099389d439c65204f.tar.gz
1 files changed, 498 insertions, 267 deletions
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 9306e0303d..64e3cfe404 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,6 +31,7 @@
 #include "arm_compute/core/NEON/NEMath.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include <arm_neon.h>
@@ -39,11 +40,225 @@ namespace arm_compute
 {
 namespace
 {
+uint32x4x4_t calculate_index(uint32_t idx, float32x4_t a, float32x4_t b, uint32x4x4_t c, ReductionOperation op, int axis)
+{
+    uint32x4_t mask{ 0 };
+    if(op == ReductionOperation::ARG_IDX_MIN)
+    {
+        mask = wrapper::vcgt(b, a);
+    }
+    else
+    {
+        mask = wrapper::vclt(b, a);
+    }
+
+    uint32x4_t vec_idx = { idx, idx + 1, idx + 2, idx + 3 };
+    if(axis != 0)
+    {
+        vec_idx = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+    }
+    uint32x4x4_t res = { wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0 };
+
+    return res;
+}
+
+uint32x4x4_t calculate_index(uint32_t idx, uint8x16_t a, uint8x16_t b, uint32x4x4_t c, ReductionOperation op, int axis)
+{
+    uint32x4x4_t mask{ 0 };
+    uint8x16_t   mask_u8{ 0 };
+    if(op == ReductionOperation::ARG_IDX_MIN)
+    {
+        mask_u8 = wrapper::vcgt(b, a);
+    }
+    else
+    {
+        mask_u8 = wrapper::vclt(b, a);
+    }
+    mask.val[0]          = wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(mask_u8))));
+    mask.val[1]          = wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(mask_u8))));
+    mask.val[2]          = wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(mask_u8))));
+    mask.val[3]          = wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(mask_u8))));
+    uint32x4x4_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
+            { idx + 4, idx + 5, idx + 6, idx + 7 },
+            { idx + 8, idx + 9, idx + 10, idx + 11 },
+            { idx + 12, idx + 13, idx + 14, idx + 15 }
+        }
+    };
+    if(axis != 0)
+    {
+        vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+        vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+        vec_idx.val[2] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+        vec_idx.val[3] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+    }
+    uint32x4x4_t res = { vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]),
+                         vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]),
+                         vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]),
+                         vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])
+                       };
+
+    return res;
+}
+
+uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float32x4_t vec_res_value, ReductionOperation op)
+{
+    uint32x4_t res_idx_mask{ 0 };
+    uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
+
+    if(op == ReductionOperation::ARG_IDX_MIN)
+    {
+        auto pmin    = wrapper::vpmin(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+        pmin         = wrapper::vpmin(pmin, pmin);
+        auto mask    = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
+        res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask);
+    }
+    else
+    {
+        auto pmax    = wrapper::vpmax(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+        pmax         = wrapper::vpmax(pmax, pmax);
+        auto mask    = vceqq_f32(vec_res_value, wrapper::vcombine(pmax, pmax));
+        res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask);
+    }
+
+    res_idx_mask = wrapper::vadd(res_idx_mask, mask_ones);
+    auto pmin    = wrapper::vpmin(wrapper::vgethigh(res_idx_mask), wrapper::vgetlow(res_idx_mask));
+    pmin         = wrapper::vpmin(pmin, pmin);
+    uint32_t res = wrapper::vgetlane(pmin, 0);
+
+    return (res - 0xFFFFFFFF);
+}
+
+uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, uint8x16_t vec_res_value, ReductionOperation op)
+{
+    uint32x4x4_t res_idx_mask{ 0 };
+    uint32x4_t   mask_ones = vdupq_n_u32(0xFFFFFFFF);
+    uint8x16_t   mask_u8{ 0 };
+    if(op == ReductionOperation::ARG_IDX_MIN)
+    {
+        auto pmin = wrapper::vpmin(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+        pmin      = wrapper::vpmin(pmin, pmin);
+        pmin      = wrapper::vpmin(pmin, pmin);
+        pmin      = wrapper::vpmin(pmin, pmin);
+        mask_u8   = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
+    }
+    else
+    {
+        auto pmax = wrapper::vpmax(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+        pmax      = wrapper::vpmax(pmax, pmax);
+        pmax      = wrapper::vpmax(pmax, pmax);
+        pmax      = wrapper::vpmax(pmax, pmax);
+        mask_u8   = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax));
+    }
+
+    // Widen vectors
+    auto wide_u16_1     = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
+    auto wide_u16_2     = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
+    auto wide_u32_1     = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
+    auto wide_u32_2     = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
+    auto wide_u32_3     = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
+    auto wide_u32_4     = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
+    res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1);
+    res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2);
+    res_idx_mask.val[2] = wrapper::vand(vec_res_idx.val[2], wide_u32_3);
+    res_idx_mask.val[3] = wrapper::vand(vec_res_idx.val[3], wide_u32_4);
+    res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones);
+    res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones);
+    res_idx_mask.val[2] = wrapper::vadd(res_idx_mask.val[2], mask_ones);
+    res_idx_mask.val[3] = wrapper::vadd(res_idx_mask.val[3], mask_ones);
+
+    uint32_t res  = 0xFFFFFFFF;
+    int      iter = 0;
+    do
+    {
+        auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter]));
+        pmin      = wrapper::vpmin(pmin, pmin);
+        res       = std::min(wrapper::vgetlane(pmin, 0), res);
+        iter++;
+    }
+    while(iter < 4);
+
+    return (res - 0xFFFFFFFF);
+}
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis)
+{
+    uint32x4x2_t mask{ 0 };
+    uint16x8_t   mask_u16{ 0 };
+    if(op == ReductionOperation::ARG_IDX_MIN)
+    {
+        mask_u16 = wrapper::vcgt(b, a);
+    }
+    else
+    {
+        mask_u16 = wrapper::vclt(b, a);
+    }
+    mask.val[0]          = wrapper::vmovl(wrapper::vgetlow(mask_u16));
+    mask.val[1]          = wrapper::vmovl(wrapper::vgethigh(mask_u16));
+    uint32x4x2_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
+            { idx + 4, idx + 5, idx + 6, idx + 7 }
+        }
+    };
+    if(axis != 0)
+    {
+        vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+        vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+    }
+    uint32x4x4_t res = { wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]),
+                         wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]),
+                         0, 0
+                       };
+
+    return res;
+}
+
+uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_value, ReductionOperation op)
+{
+    uint32x4x2_t res_idx_mask{ 0 };
+    uint32x4_t   mask_ones = vdupq_n_u32(0xFFFFFFFF);
+    uint16x8_t   mask_u16;
+    if(op == ReductionOperation::ARG_IDX_MIN)
+    {
+        auto pmin = wrapper::vpmin(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+        pmin      = wrapper::vpmin(pmin, pmin);
+        pmin      = wrapper::vpmin(pmin, pmin);
+        mask_u16  = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
+    }
+    else
+    {
+        auto pmax = wrapper::vpmax(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+        pmax      = wrapper::vpmax(pmax, pmax);
+        pmax      = wrapper::vpmax(pmax, pmax);
+        mask_u16  = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax));
+    }
+
+    // Widen vectors
+    auto wide_u32_1     = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16)));
+    auto wide_u32_2     = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16)));
+    res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1);
+    res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2);
+    res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones);
+    res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones);
+
+    uint32_t res  = 0xFFFFFFFF;
+    int      iter = 0;
+    do
+    {
+        auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter]));
+        pmin      = wrapper::vpmin(pmin, pmin);
+        res       = std::min(wrapper::vgetlane(pmin, 0), res);
+        iter++;
+    }
+    while(iter < 2);
+
+    return (res - 0xFFFFFFFF);
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
 template <class F>
 class Reducer
 {
 public:
-    static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f)
+    static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
     {
         // Set out window
         Window out_window(window);
@@ -58,11 +273,11 @@ public:
             Iterator in(input, in_slice);
             Iterator out(output, out_slice);
 
-            f(in, out, in_slice, out_slice, *input->info());
+            f(in, out, in_slice, out_slice, *input->info(), op);
         }
         while(window.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
     }
-    static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f)
+    static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
     {
         // Set in window
         Window in_window(window);
@@ -80,11 +295,11 @@ public:
             Iterator in(input, in_slice);
             Iterator out(output, out_slice);
 
-            f(in, out, in_slice, out_slice, *input->info(), 1);
+            f(in, out, in_slice, out_slice, *input->info(), 1, op);
         }
         while(in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
     }
-    static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f)
+    static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
     {
         // Set in window
         Window in_window(window);
@@ -102,11 +317,11 @@ public:
             Iterator in(input, in_slice);
             Iterator out(output, out_slice);
 
-            f(in, out, in_slice, out_slice, *input->info(), 2);
+            f(in, out, in_slice, out_slice, *input->info(), 2, op);
         }
         while(in_window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_3D(out_slice));
     }
-    static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f)
+    static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
     {
         // Set in/out window
         Window in_window(window);
@@ -124,115 +339,205 @@ public:
             Iterator in(input, in_slice);
             Iterator out(output, out_slice);
 
-            f(in, out, in_slice, out_slice, *input->info(), 3);
+            f(in, out, in_slice, out_slice, *input->info(), 3, op);
         }
         while(in_window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_4D(out_slice));
     }
 };
 
-template <typename T, int S, ReductionOperation op>
+template <typename T, int S>
 struct RedOpX
 {
     /** NEON vector tag type. */
     using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
 
-    inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info)
+    inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, const ReductionOperation op)
     {
         ARM_COMPUTE_UNUSED(out_slice);
-        auto vec_sum_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+        auto init_res_value = static_cast<T>(0.f);
+        if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN)
+        {
+            init_res_value = *reinterpret_cast<T *>(input.ptr());
+        }
+        auto         vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{});
+        uint32x4x4_t vec_res_idx{ 0 };
 
         execute_window_loop(in_slice, [&](const Coordinates & id)
         {
             const auto in_ptr       = reinterpret_cast<const T *>(input.ptr());
             const auto vec_elements = wrapper::vloadq(in_ptr);
 
-            if(op == ReductionOperation::SUM_SQUARE)
-            {
-                vec_sum_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_sum_value);
-            }
-            else
+            switch(op)
             {
-                vec_sum_value = wrapper::vadd(vec_elements, vec_sum_value);
+                case ReductionOperation::SUM_SQUARE:
+                    vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
+                    break;
+                case ReductionOperation::MEAN_SUM:
+                case ReductionOperation::SUM:
+                    vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
+                    break;
+                case ReductionOperation::ARG_IDX_MIN:
+                {
+                    auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+                    vec_res_idx             = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+                    vec_res_value           = temp_vec_res_value;
+                    break;
+                }
+                case ReductionOperation::ARG_IDX_MAX:
+                {
+                    auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+                    vec_res_idx             = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+                    vec_res_value           = temp_vec_res_value;
+                    break;
+                }
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
             }
         },
         input);
 
-        auto carry_addition = wrapper::vpadd(wrapper::vgethigh(vec_sum_value), wrapper::vgetlow(vec_sum_value));
-        for(int i = 0; i < S / 4; ++i)
+        switch(op)
         {
-            carry_addition = wrapper::vpadd(carry_addition, carry_addition);
-        }
+            case ReductionOperation::SUM:
+            case ReductionOperation::SUM_SQUARE:
+            case ReductionOperation::MEAN_SUM:
+            {
+                auto carry_res = wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+                for(int i = 0; i < S / 4; ++i)
+                {
+                    carry_res = wrapper::vpadd(carry_res, carry_res);
+                }
+                auto res = wrapper::vgetlane(carry_res, 0);
 
-        auto res = wrapper::vgetlane(carry_addition, 0);
-        if(op == ReductionOperation::MEAN_SUM)
-        {
-            res /= in_info.dimension(0);
-        }
+                if(op == ReductionOperation::MEAN_SUM)
+                {
+                    res /= in_info.dimension(0);
+                }
 
-        *(reinterpret_cast<T *>(output.ptr())) = res;
+                *(reinterpret_cast<T *>(output.ptr())) = res;
+                break;
+            }
+            case ReductionOperation::ARG_IDX_MIN:
+            case ReductionOperation::ARG_IDX_MAX:
+            {
+                auto res                                      = calculate_vector_index(vec_res_idx, vec_res_value, op);
+                *(reinterpret_cast<uint32_t *>(output.ptr())) = res;
+                break;
+            }
+            default:
+                ARM_COMPUTE_ERROR("Not supported");
+        }
     }
 };
 
-template <ReductionOperation op>
 struct RedOpX_qasymm8
 {
-    inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info)
+    inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, const ReductionOperation op)
     {
         ARM_COMPUTE_UNUSED(out_slice);
-        auto vec_sum_value1 = vdupq_n_u32(static_cast<uint32_t>(0.f));
-        auto vec_sum_value2 = vdupq_n_u32(static_cast<uint32_t>(0.f));
-        auto vec_sum_value3 = vdupq_n_u32(static_cast<uint32_t>(0.f));
-        auto vec_sum_value4 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+        auto vec_res_value1 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+        auto vec_res_value2 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+        auto vec_res_value3 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+        auto vec_res_value4 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+
+        uint8x16_t vec_res_value;
+        if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN)
+        {
+            vec_res_value = wrapper::vdup_n(*input.ptr(), wrapper::traits::vector_128_tag{});
+        }
 
+        uint32x4x4_t vec_res_idx{ 0 };
         execute_window_loop(in_slice, [&](const Coordinates & id)
         {
             const auto vec_elements = wrapper::vloadq(input.ptr());
-
-            const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
-            const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
-            const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
-            const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
-            const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
-            const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
-            vec_sum_value1 = wrapper::vadd(temp32x4t_1, vec_sum_value1);
-            vec_sum_value2 = wrapper::vadd(temp32x4t_2, vec_sum_value2);
-            vec_sum_value3 = wrapper::vadd(temp32x4t_3, vec_sum_value3);
-            vec_sum_value4 = wrapper::vadd(temp32x4t_4, vec_sum_value4);
+            switch(op)
+            {
+                case ReductionOperation::SUM:
+                case ReductionOperation::MEAN_SUM:
+                {
+                    const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+                    const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+                    const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+                    const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+                    const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+                    const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+                    vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
+                    vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
+                    vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
+                    vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
+                    break;
+                }
+                case ReductionOperation::ARG_IDX_MIN:
+                {
+                    auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+                    vec_res_idx             = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+                    vec_res_value           = temp_vec_res_value;
+                    break;
+                }
+                case ReductionOperation::ARG_IDX_MAX:
+                {
+                    auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+                    vec_res_idx             = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+                    vec_res_value           = temp_vec_res_value;
+                    break;
+                }
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+            }
         },
         input);
 
-        auto carry_addition = wrapper::vadd(vec_sum_value1, vec_sum_value2);
-        carry_addition      = wrapper::vadd(carry_addition, vec_sum_value3);
-        carry_addition      = wrapper::vadd(carry_addition, vec_sum_value4);
-
-        auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_addition), wrapper::vgetlow(carry_addition));
-        carry_paddition      = wrapper::vpadd(carry_paddition, carry_paddition);
-        auto res             = wrapper::vgetlane(carry_paddition, 0);
-
-        if(op == ReductionOperation::MEAN_SUM)
+        if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
         {
-            res /= in_info.dimension(0);
+            auto res                                      = calculate_vector_index(vec_res_idx, vec_res_value, op);
+            *(reinterpret_cast<uint32_t *>(output.ptr())) = res;
         }
+        else
+        {
+            auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2);
+            carry_res      = wrapper::vadd(carry_res, vec_res_value3);
+            carry_res      = wrapper::vadd(carry_res, vec_res_value4);
 
-        *(output.ptr()) = static_cast<uint8_t>(res);
+            auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res));
+            carry_paddition      = wrapper::vpadd(carry_paddition, carry_paddition);
+            auto res             = wrapper::vgetlane(carry_paddition, 0);
+
+            if(op == ReductionOperation::MEAN_SUM)
+            {
+                res /= in_info.dimension(0);
+            }
+
+            *(output.ptr()) = static_cast<uint8_t>(res);
+        }
     }
 };
 
-template <typename T, int S, ReductionOperation op>
+template <typename T, int S>
 struct RedOpYZW
 {
     /** NEON vector tag type. */
     using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+    using neon_vector  = typename wrapper::traits::neon_vector<T, S>::type;
 
-    inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis)
+    inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis, const ReductionOperation op)
     {
         ARM_COMPUTE_UNUSED(out_slice);
 
         execute_window_loop(in_slice, [&](const Coordinates & id)
         {
-            auto vec_sum_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+            neon_vector vec_res_value;
+            if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN)
+            {
+                vec_res_value = wrapper::vloadq(reinterpret_cast<T *>(input.ptr()));
+            }
+            else
+            {
+                vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+            }
+            uint32x4x4_t vec_res_idx{ 0 };
+
             for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
             {
                 T *in_ptr;
@@ -252,41 +557,68 @@ struct RedOpYZW
                 }
                 const auto vec_elements = wrapper::vloadq(in_ptr);
 
-                if(op == ReductionOperation::SUM_SQUARE)
+                switch(op)
                 {
-                    vec_sum_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_sum_value);
-                }
-                else
-                {
-                    vec_sum_value = wrapper::vadd(vec_elements, vec_sum_value);
+                    case ReductionOperation::SUM:
+                    case ReductionOperation::MEAN_SUM:
+                        vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
+                        break;
+                    case ReductionOperation::SUM_SQUARE:
+                        vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
+                        break;
+                    case ReductionOperation::ARG_IDX_MIN:
+                    {
+                        auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+                        vec_res_idx             = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+                        vec_res_value           = temp_vec_res_value;
+                        break;
+                    }
+                    case ReductionOperation::ARG_IDX_MAX:
+                    {
+                        auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+                        vec_res_idx             = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+                        vec_res_value           = temp_vec_res_value;
+                        break;
+                    }
+                    default:
+                        ARM_COMPUTE_ERROR("Not supported");
                 }
             }
 
             if(op == ReductionOperation::MEAN_SUM)
             {
                 auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast<T>(in_info.dimension(axis)), ExactTagType{}));
-                vec_sum_value      = wrapper::vmul(vec_sum_value, vec_width_inv);
+                vec_res_value      = wrapper::vmul(vec_res_value, vec_width_inv);
             }
 
-            wrapper::vstore(reinterpret_cast<T *>(output.ptr()), vec_sum_value);
+            if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
+            {
+                wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()), vec_res_idx.val[0]);
+            }
+            else
+            {
+                wrapper::vstore(reinterpret_cast<T *>(output.ptr()), vec_res_value);
+            }
         },
         input, output);
     }
 };
 
-template <ReductionOperation op>
 struct RedOpYZW_qasymm8
 {
-    inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis)
+    inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis, const ReductionOperation op)
     {
         ARM_COMPUTE_UNUSED(out_slice);
 
         execute_window_loop(in_slice, [&](const Coordinates & id)
         {
-            auto vec_sum_value1 = vdupq_n_u32(static_cast<uint32_t>(0.f));
-            auto vec_sum_value2 = vdupq_n_u32(static_cast<uint32_t>(0.f));
-            auto vec_sum_value3 = vdupq_n_u32(static_cast<uint32_t>(0.f));
-            auto vec_sum_value4 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+            uint32x4x4_t vec_res_idx{ 0 };
+            auto         vec_res_value1 = vdupq_n_u32(0);
+            auto         vec_res_value2 = vdupq_n_u32(0);
+            auto         vec_res_value3 = vdupq_n_u32(0);
+            auto         vec_res_value4 = vdupq_n_u32(0);
+            auto         vec_res_value  = wrapper::vloadq(input.ptr());
+
             for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
             {
                 uint8_t *in_ptr;
@@ -306,169 +638,78 @@ struct RedOpYZW_qasymm8
                 }
                 const auto vec_elements = wrapper::vloadq(in_ptr);
 
-                const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
-                const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
-                const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
-                const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
-                const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
-                const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
-                vec_sum_value1 = wrapper::vadd(temp32x4t_1, vec_sum_value1);
-                vec_sum_value2 = wrapper::vadd(temp32x4t_2, vec_sum_value2);
-                vec_sum_value3 = wrapper::vadd(temp32x4t_3, vec_sum_value3);
-                vec_sum_value4 = wrapper::vadd(temp32x4t_4, vec_sum_value4);
+                switch(op)
+                {
+                    case ReductionOperation::SUM:
+                    case ReductionOperation::MEAN_SUM:
+                    {
+                        const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+                        const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+                        const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+                        const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+                        const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+                        const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+                        vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
+                        vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
+                        vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
+                        vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
+                        break;
+                    }
+                    case ReductionOperation::ARG_IDX_MIN:
+                    {
+                        auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+                        vec_res_idx             = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+                        vec_res_value           = temp_vec_res_value;
+                        break;
+                    }
+                    case ReductionOperation::ARG_IDX_MAX:
+                    {
+                        auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+                        vec_res_idx             = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+                        vec_res_value           = temp_vec_res_value;
+                        break;
+                    }
+                    default:
+                        ARM_COMPUTE_ERROR("Not supported");
+                }
             }
 
             if(op == ReductionOperation::MEAN_SUM)
             {
                 const auto vec_width_inv    = wrapper::vinv(vdupq_n_f32(in_info.dimension(axis)));
-                const auto vec_sum_value1_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value1), vec_width_inv);
-                const auto vec_sum_value2_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value2), vec_width_inv);
-                const auto vec_sum_value3_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value3), vec_width_inv);
-                const auto vec_sum_value4_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value4), vec_width_inv);
-
-                vec_sum_value1 = vcvtq_u32_f32(vec_sum_value1_f);
-                vec_sum_value2 = vcvtq_u32_f32(vec_sum_value2_f);
-                vec_sum_value3 = vcvtq_u32_f32(vec_sum_value3_f);
-                vec_sum_value4 = vcvtq_u32_f32(vec_sum_value4_f);
-            }
-
-            const auto temp16x8t_1 = vcombine_u16(wrapper::vqmovn(vec_sum_value1), wrapper::vqmovn(vec_sum_value2));
-            const auto temp16x8t_2 = vcombine_u16(wrapper::vqmovn(vec_sum_value3), wrapper::vqmovn(vec_sum_value4));
-            auto       res         = vcombine_u8(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
-            wrapper::vstore(output.ptr(), res);
-        },
-        input, output);
-    }
-};
-
-void reduce_sumsq(const Window &window, const ITensor *input, ITensor *output, unsigned int axis)
-{
-    switch(axis)
-    {
-        case 0:
-            switch(input->info()->data_type())
-            {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F16:
-                    return Reducer<RedOpX<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceX(window, input, output, RedOpX<float16_t, 8, ReductionOperation::SUM_SQUARE>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F32:
-                    return Reducer<RedOpX<float, 4, ReductionOperation::SUM_SQUARE>>::reduceX(window, input, output, RedOpX<float, 4, ReductionOperation::SUM_SQUARE>());
-                case DataType::QASYMM8:
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-            }
-        case 1:
-            switch(input->info()->data_type())
-            {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F16:
-                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceY(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F32:
-                    return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>>::reduceY(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>());
-                case DataType::QASYMM8:
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
+                const auto vec_res_value1_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value1), vec_width_inv);
+                const auto vec_res_value2_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value2), vec_width_inv);
+                const auto vec_res_value3_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value3), vec_width_inv);
+                const auto vec_res_value4_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value4), vec_width_inv);
+
+                vec_res_value1 = vcvtq_u32_f32(vec_res_value1_f);
+                vec_res_value2 = vcvtq_u32_f32(vec_res_value2_f);
+                vec_res_value3 = vcvtq_u32_f32(vec_res_value3_f);
+                vec_res_value4 = vcvtq_u32_f32(vec_res_value4_f);
             }
-        case 2:
-            switch(input->info()->data_type())
+            if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
             {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F16:
-                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F32:
-                    return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>>::reduceZ(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>());
-                case DataType::QASYMM8:
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
+                wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()), vec_res_idx.val[0]);
+                wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + 4, vec_res_idx.val[1]);
+                wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + 8, vec_res_idx.val[2]);
+                wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + 12, vec_res_idx.val[3]);
             }
-        case 3:
-            switch(input->info()->data_type())
+            else
             {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F16:
-                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceW(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F32:
-                    return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>>::reduceW(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>());
-                case DataType::QASYMM8:
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
+                const auto temp16x8t_1 = vcombine_u16(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
+                const auto temp16x8t_2 = vcombine_u16(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
+                auto       res         = vcombine_u8(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
+                wrapper::vstore(output.ptr(), res);
             }
-        default:
-            ARM_COMPUTE_ERROR("Unsupported reduction axis");
-    }
-}
 
-void reduce_sum(const Window &window, const ITensor *input, ITensor *output, unsigned int axis)
-{
-    switch(axis)
-    {
-        case 0:
-            switch(input->info()->data_type())
-            {
-                case DataType::QASYMM8:
-                    return Reducer<RedOpX_qasymm8<ReductionOperation::SUM>>::reduceX(window, input, output, RedOpX_qasymm8<ReductionOperation::SUM>());
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F16:
-                    return Reducer<RedOpX<float16_t, 8, ReductionOperation::SUM>>::reduceX(window, input, output, RedOpX<float16_t, 8, ReductionOperation::SUM>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F32:
-                    return Reducer<RedOpX<float, 4, ReductionOperation::SUM>>::reduceX(window, input, output, RedOpX<float, 4, ReductionOperation::SUM>());
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-            }
-        case 1:
-            switch(input->info()->data_type())
-            {
-                case DataType::QASYMM8:
-                    return Reducer<RedOpYZW_qasymm8<ReductionOperation::SUM>>::reduceY(window, input, output, RedOpYZW_qasymm8<ReductionOperation::SUM>());
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F16:
-                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM>>::reduceY(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F32:
-                    return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM>>::reduceY(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM>());
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-            }
-        case 2:
-            switch(input->info()->data_type())
-            {
-                case DataType::QASYMM8:
-                    return Reducer<RedOpYZW_qasymm8<ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW_qasymm8<ReductionOperation::SUM>());
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F16:
-                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F32:
-                    return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM>());
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-            }
-        case 3:
-            switch(input->info()->data_type())
-            {
-                case DataType::QASYMM8:
-                    return Reducer<RedOpYZW_qasymm8<ReductionOperation::SUM>>::reduceW(window, input, output, RedOpYZW_qasymm8<ReductionOperation::SUM>());
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F16:
-                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM>>::reduceW(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F32:
-                    return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM>>::reduceW(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM>());
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-            }
-        default:
-            ARM_COMPUTE_ERROR("Unsupported reduction axis");
+        },
+        input, output);
     }
-}
-void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output, unsigned int axis)
+};
+
+void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op)
 {
     switch(axis)
     {
@@ -476,13 +717,13 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output
             switch(input->info()->data_type())
             {
                 case DataType::QASYMM8:
-                    return Reducer<RedOpX_qasymm8<ReductionOperation::MEAN_SUM>>::reduceX(window, input, output, RedOpX_qasymm8<ReductionOperation::MEAN_SUM>());
+                    return Reducer<RedOpX_qasymm8>::reduceX(window, input, output, RedOpX_qasymm8(), op);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F16:
-                    return Reducer<RedOpX<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceX(window, input, output, RedOpX<float16_t, 8, ReductionOperation::MEAN_SUM>());
+                    return Reducer<RedOpX<float16_t, 8>>::reduceX(window, input, output, RedOpX<float16_t, 8>(), op);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F32:
-                    return Reducer<RedOpX<float, 4, ReductionOperation::MEAN_SUM>>::reduceX(window, input, output, RedOpX<float, 4, ReductionOperation::MEAN_SUM>());
+                    return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op);
                 default:
                     ARM_COMPUTE_ERROR("Not supported");
             }
@@ -490,13 +731,13 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output
             switch(input->info()->data_type())
             {
                 case DataType::QASYMM8:
-                    return Reducer<RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>>::reduceY(window, input, output, RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>());
+                    return Reducer<RedOpYZW_qasymm8>::reduceY(window, input, output, RedOpYZW_qasymm8(), op);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F16:
-                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceY(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>());
+                    return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(), op);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F32:
-                    return Reducer<RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>>::reduceY(window, input, output, RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>());
+                    return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(), op);
                 default:
                     ARM_COMPUTE_ERROR("Not supported");
             }
@@ -504,13 +745,13 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output
             switch(input->info()->data_type())
             {
                 case DataType::QASYMM8:
-                    return Reducer<RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>>::reduceZ(window, input, output, RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>());
+                    return Reducer<RedOpYZW_qasymm8>::reduceZ(window, input, output, RedOpYZW_qasymm8(), op);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F16:
-                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>());
+                    return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8>(), op);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F32:
-                    return Reducer<RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>>::reduceZ(window, input, output, RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>());
+                    return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(), op);
                 default:
                     ARM_COMPUTE_ERROR("Not supported");
             }
@@ -518,13 +759,13 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output
             switch(input->info()->data_type())
             {
                 case DataType::QASYMM8:
-                    return Reducer<RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>>::reduceW(window, input, output, RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>());
+                    return Reducer<RedOpYZW_qasymm8>::reduceW(window, input, output, RedOpYZW_qasymm8(), op);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F16:
-                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceW(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>());
+                    return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, RedOpYZW<float16_t, 8>(), op);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F32:
-                    return Reducer<RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>>::reduceW(window, input, output, RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>());
+                    return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(), op);
                 default:
                     ARM_COMPUTE_ERROR("Not supported");
             }
@@ -533,14 +774,6 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output
     }
 }
 
-TensorShape calculate_output_shape(const TensorShape &input_shape, unsigned int axis)
-{
-    TensorShape output_shape{ input_shape };
-    output_shape.set(axis, 1);
-
-    return output_shape;
-}
-
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
 {
     ARM_COMPUTE_UNUSED(op);
@@ -553,10 +786,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
 
     if(output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN);
+        if(!is_arg_min_max)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32);
+        }
+
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
 
-        const TensorShape output_shape         = calculate_output_shape(input->tensor_shape(), axis);
+        const TensorShape output_shape         = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
         const TensorInfo  tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped);
     }
@@ -564,13 +806,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
     return Status{};
 }
 
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis)
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis, ReductionOperation op)
 {
     // Calculate output shape and set if empty
-    const TensorShape output_shape = calculate_output_shape(input->tensor_shape(), axis);
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output, output_shape, 1, input->data_type());
+    const bool is_arg_min_max   = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
+    DataType   output_data_type = is_arg_min_max ? DataType::U32 : input->data_type();
+    auto_init_if_empty(*output, output_shape, 1, output_data_type);
 
     unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type());
 
@@ -613,7 +857,7 @@ void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output
     _reduction_axis = axis;
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis);
+    auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis, op);
 
     ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
@@ -623,7 +867,7 @@ void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output
 Status NEReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
-    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis)));
+    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis, op)));
 
     return Status{};
 }
@@ -634,19 +878,6 @@ void NEReductionOperationKernel::run(const Window &window, const ThreadInfo &inf
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    switch(_op)
-    {
-        case ReductionOperation::SUM_SQUARE:
-            reduce_sumsq(window, _input, _output, _reduction_axis);
-            break;
-        case ReductionOperation::MEAN_SUM:
-            reduce_mean_sum(window, _input, _output, _reduction_axis);
-            break;
-        case ReductionOperation::SUM:
-            reduce_sum(window, _input, _output, _reduction_axis);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Unsupported reduction operation.");
-    }
+    reduce_op(window, _input, _output, _reduction_axis, _op);
 }
 } // namespace arm_compute
author	Michalis Spyrou <michalis.spyrou@arm.com>	2019-01-03 11:10:25 +0000
committer	Michalis Spyrou <michalis.spyrou@arm.com>	2019-01-10 16:24:26 +0000
commit	aea14c63e2efeda9d5f7492099389d439c65204f (patch)
tree	176a6181bbf00e4df078d5da0a17dd44f248958e /src/core/NEON/kernels/NEReductionOperationKernel.cpp
parent	c10bc0b5db5169a6ccea02a1aaefe34f082709e5 (diff)
download	ComputeLibrary-aea14c63e2efeda9d5f7492099389d439c65204f.tar.gz