aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/NEReductionOperationKernel.cpp
diff options
context:
space:
mode:
authorMichalis Spyrou <michalis.spyrou@arm.com>2019-01-03 11:10:25 +0000
committerMichalis Spyrou <michalis.spyrou@arm.com>2019-01-10 16:24:26 +0000
commitaea14c63e2efeda9d5f7492099389d439c65204f (patch)
tree176a6181bbf00e4df078d5da0a17dd44f248958e /src/core/NEON/kernels/NEReductionOperationKernel.cpp
parentc10bc0b5db5169a6ccea02a1aaefe34f082709e5 (diff)
downloadComputeLibrary-aea14c63e2efeda9d5f7492099389d439c65204f.tar.gz
COMPMID-1764 NEON: Implement ArgMax/ArgMin
Change-Id: Ibe23aa90b36ffd8553d1d1c35fada5d300fab829 Reviewed-on: https://review.mlplatform.org/475 Reviewed-by: Isabella Gottardi <isabella.gottardi@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Giuseppe Rossini <giuseppe.rossini@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/NEReductionOperationKernel.cpp')
-rw-r--r--src/core/NEON/kernels/NEReductionOperationKernel.cpp765
1 files changed, 498 insertions, 267 deletions
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 9306e0303d..64e3cfe404 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,6 +31,7 @@
#include "arm_compute/core/NEON/NEMath.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
@@ -39,11 +40,225 @@ namespace arm_compute
{
namespace
{
+uint32x4x4_t calculate_index(uint32_t idx, float32x4_t a, float32x4_t b, uint32x4x4_t c, ReductionOperation op, int axis)
+{
+ uint32x4_t mask{ 0 };
+ if(op == ReductionOperation::ARG_IDX_MIN)
+ {
+ mask = wrapper::vcgt(b, a);
+ }
+ else
+ {
+ mask = wrapper::vclt(b, a);
+ }
+
+ uint32x4_t vec_idx = { idx, idx + 1, idx + 2, idx + 3 };
+ if(axis != 0)
+ {
+ vec_idx = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ }
+ uint32x4x4_t res = { wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0 };
+
+ return res;
+}
+
+uint32x4x4_t calculate_index(uint32_t idx, uint8x16_t a, uint8x16_t b, uint32x4x4_t c, ReductionOperation op, int axis)
+{
+ uint32x4x4_t mask{ 0 };
+ uint8x16_t mask_u8{ 0 };
+ if(op == ReductionOperation::ARG_IDX_MIN)
+ {
+ mask_u8 = wrapper::vcgt(b, a);
+ }
+ else
+ {
+ mask_u8 = wrapper::vclt(b, a);
+ }
+ mask.val[0] = wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(mask_u8))));
+ mask.val[1] = wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(mask_u8))));
+ mask.val[2] = wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(mask_u8))));
+ mask.val[3] = wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(mask_u8))));
+ uint32x4x4_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
+ { idx + 4, idx + 5, idx + 6, idx + 7 },
+ { idx + 8, idx + 9, idx + 10, idx + 11 },
+ { idx + 12, idx + 13, idx + 14, idx + 15 }
+ }
+ };
+ if(axis != 0)
+ {
+ vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ vec_idx.val[2] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ vec_idx.val[3] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ }
+ uint32x4x4_t res = { vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]),
+ vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]),
+ vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]),
+ vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])
+ };
+
+ return res;
+}
+
+uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float32x4_t vec_res_value, ReductionOperation op)
+{
+ uint32x4_t res_idx_mask{ 0 };
+ uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
+
+ if(op == ReductionOperation::ARG_IDX_MIN)
+ {
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ pmin = wrapper::vpmin(pmin, pmin);
+ auto mask = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
+ res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask);
+ }
+ else
+ {
+ auto pmax = wrapper::vpmax(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ pmax = wrapper::vpmax(pmax, pmax);
+ auto mask = vceqq_f32(vec_res_value, wrapper::vcombine(pmax, pmax));
+ res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask);
+ }
+
+ res_idx_mask = wrapper::vadd(res_idx_mask, mask_ones);
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask), wrapper::vgetlow(res_idx_mask));
+ pmin = wrapper::vpmin(pmin, pmin);
+ uint32_t res = wrapper::vgetlane(pmin, 0);
+
+ return (res - 0xFFFFFFFF);
+}
+
+uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, uint8x16_t vec_res_value, ReductionOperation op)
+{
+ uint32x4x4_t res_idx_mask{ 0 };
+ uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
+ uint8x16_t mask_u8{ 0 };
+ if(op == ReductionOperation::ARG_IDX_MIN)
+ {
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ pmin = wrapper::vpmin(pmin, pmin);
+ pmin = wrapper::vpmin(pmin, pmin);
+ pmin = wrapper::vpmin(pmin, pmin);
+ mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
+ }
+ else
+ {
+ auto pmax = wrapper::vpmax(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ pmax = wrapper::vpmax(pmax, pmax);
+ pmax = wrapper::vpmax(pmax, pmax);
+ pmax = wrapper::vpmax(pmax, pmax);
+ mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax));
+ }
+
+ // Widen vectors
+ auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
+ auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
+ auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
+ auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
+ auto wide_u32_3 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
+ auto wide_u32_4 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
+ res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1);
+ res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2);
+ res_idx_mask.val[2] = wrapper::vand(vec_res_idx.val[2], wide_u32_3);
+ res_idx_mask.val[3] = wrapper::vand(vec_res_idx.val[3], wide_u32_4);
+ res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones);
+ res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones);
+ res_idx_mask.val[2] = wrapper::vadd(res_idx_mask.val[2], mask_ones);
+ res_idx_mask.val[3] = wrapper::vadd(res_idx_mask.val[3], mask_ones);
+
+ uint32_t res = 0xFFFFFFFF;
+ int iter = 0;
+ do
+ {
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter]));
+ pmin = wrapper::vpmin(pmin, pmin);
+ res = std::min(wrapper::vgetlane(pmin, 0), res);
+ iter++;
+ }
+ while(iter < 4);
+
+ return (res - 0xFFFFFFFF);
+}
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis)
+{
+ uint32x4x2_t mask{ 0 };
+ uint16x8_t mask_u16{ 0 };
+ if(op == ReductionOperation::ARG_IDX_MIN)
+ {
+ mask_u16 = wrapper::vcgt(b, a);
+ }
+ else
+ {
+ mask_u16 = wrapper::vclt(b, a);
+ }
+ mask.val[0] = wrapper::vmovl(wrapper::vgetlow(mask_u16));
+ mask.val[1] = wrapper::vmovl(wrapper::vgethigh(mask_u16));
+ uint32x4x2_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
+ { idx + 4, idx + 5, idx + 6, idx + 7 }
+ }
+ };
+ if(axis != 0)
+ {
+ vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ }
+ uint32x4x4_t res = { wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]),
+ wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]),
+ 0, 0
+ };
+
+ return res;
+}
+
+uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_value, ReductionOperation op)
+{
+ uint32x4x2_t res_idx_mask{ 0 };
+ uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
+ uint16x8_t mask_u16;
+ if(op == ReductionOperation::ARG_IDX_MIN)
+ {
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ pmin = wrapper::vpmin(pmin, pmin);
+ pmin = wrapper::vpmin(pmin, pmin);
+ mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
+ }
+ else
+ {
+ auto pmax = wrapper::vpmax(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ pmax = wrapper::vpmax(pmax, pmax);
+ pmax = wrapper::vpmax(pmax, pmax);
+ mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax));
+ }
+
+ // Widen vectors
+ auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16)));
+ auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16)));
+ res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1);
+ res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2);
+ res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones);
+ res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones);
+
+ uint32_t res = 0xFFFFFFFF;
+ int iter = 0;
+ do
+ {
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter]));
+ pmin = wrapper::vpmin(pmin, pmin);
+ res = std::min(wrapper::vgetlane(pmin, 0), res);
+ iter++;
+ }
+ while(iter < 2);
+
+ return (res - 0xFFFFFFFF);
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
template <class F>
class Reducer
{
public:
- static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f)
+ static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
{
// Set out window
Window out_window(window);
@@ -58,11 +273,11 @@ public:
Iterator in(input, in_slice);
Iterator out(output, out_slice);
- f(in, out, in_slice, out_slice, *input->info());
+ f(in, out, in_slice, out_slice, *input->info(), op);
}
while(window.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
}
- static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f)
+ static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
{
// Set in window
Window in_window(window);
@@ -80,11 +295,11 @@ public:
Iterator in(input, in_slice);
Iterator out(output, out_slice);
- f(in, out, in_slice, out_slice, *input->info(), 1);
+ f(in, out, in_slice, out_slice, *input->info(), 1, op);
}
while(in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
}
- static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f)
+ static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
{
// Set in window
Window in_window(window);
@@ -102,11 +317,11 @@ public:
Iterator in(input, in_slice);
Iterator out(output, out_slice);
- f(in, out, in_slice, out_slice, *input->info(), 2);
+ f(in, out, in_slice, out_slice, *input->info(), 2, op);
}
while(in_window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_3D(out_slice));
}
- static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f)
+ static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
{
// Set in/out window
Window in_window(window);
@@ -124,115 +339,205 @@ public:
Iterator in(input, in_slice);
Iterator out(output, out_slice);
- f(in, out, in_slice, out_slice, *input->info(), 3);
+ f(in, out, in_slice, out_slice, *input->info(), 3, op);
}
while(in_window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_4D(out_slice));
}
};
-template <typename T, int S, ReductionOperation op>
+template <typename T, int S>
struct RedOpX
{
/** NEON vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
- inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info)
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, const ReductionOperation op)
{
ARM_COMPUTE_UNUSED(out_slice);
- auto vec_sum_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ auto init_res_value = static_cast<T>(0.f);
+ if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN)
+ {
+ init_res_value = *reinterpret_cast<T *>(input.ptr());
+ }
+ auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{});
+ uint32x4x4_t vec_res_idx{ 0 };
execute_window_loop(in_slice, [&](const Coordinates & id)
{
const auto in_ptr = reinterpret_cast<const T *>(input.ptr());
const auto vec_elements = wrapper::vloadq(in_ptr);
- if(op == ReductionOperation::SUM_SQUARE)
- {
- vec_sum_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_sum_value);
- }
- else
+ switch(op)
{
- vec_sum_value = wrapper::vadd(vec_elements, vec_sum_value);
+ case ReductionOperation::SUM_SQUARE:
+ vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
+ break;
+ case ReductionOperation::MEAN_SUM:
+ case ReductionOperation::SUM:
+ vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
+ break;
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
},
input);
- auto carry_addition = wrapper::vpadd(wrapper::vgethigh(vec_sum_value), wrapper::vgetlow(vec_sum_value));
- for(int i = 0; i < S / 4; ++i)
+ switch(op)
{
- carry_addition = wrapper::vpadd(carry_addition, carry_addition);
- }
+ case ReductionOperation::SUM:
+ case ReductionOperation::SUM_SQUARE:
+ case ReductionOperation::MEAN_SUM:
+ {
+ auto carry_res = wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ for(int i = 0; i < S / 4; ++i)
+ {
+ carry_res = wrapper::vpadd(carry_res, carry_res);
+ }
+ auto res = wrapper::vgetlane(carry_res, 0);
- auto res = wrapper::vgetlane(carry_addition, 0);
- if(op == ReductionOperation::MEAN_SUM)
- {
- res /= in_info.dimension(0);
- }
+ if(op == ReductionOperation::MEAN_SUM)
+ {
+ res /= in_info.dimension(0);
+ }
- *(reinterpret_cast<T *>(output.ptr())) = res;
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MIN:
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto res = calculate_vector_index(vec_res_idx, vec_res_value, op);
+ *(reinterpret_cast<uint32_t *>(output.ptr())) = res;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
}
};
-template <ReductionOperation op>
struct RedOpX_qasymm8
{
- inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info)
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, const ReductionOperation op)
{
ARM_COMPUTE_UNUSED(out_slice);
- auto vec_sum_value1 = vdupq_n_u32(static_cast<uint32_t>(0.f));
- auto vec_sum_value2 = vdupq_n_u32(static_cast<uint32_t>(0.f));
- auto vec_sum_value3 = vdupq_n_u32(static_cast<uint32_t>(0.f));
- auto vec_sum_value4 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+ auto vec_res_value1 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+ auto vec_res_value2 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+ auto vec_res_value3 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+ auto vec_res_value4 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+
+ uint8x16_t vec_res_value;
+ if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN)
+ {
+ vec_res_value = wrapper::vdup_n(*input.ptr(), wrapper::traits::vector_128_tag{});
+ }
+ uint32x4x4_t vec_res_idx{ 0 };
execute_window_loop(in_slice, [&](const Coordinates & id)
{
const auto vec_elements = wrapper::vloadq(input.ptr());
-
- const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
- const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
- const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
- const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
- const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
- const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
- vec_sum_value1 = wrapper::vadd(temp32x4t_1, vec_sum_value1);
- vec_sum_value2 = wrapper::vadd(temp32x4t_2, vec_sum_value2);
- vec_sum_value3 = wrapper::vadd(temp32x4t_3, vec_sum_value3);
- vec_sum_value4 = wrapper::vadd(temp32x4t_4, vec_sum_value4);
+ switch(op)
+ {
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ {
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+ vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
+ vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
+ vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
+ vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
},
input);
- auto carry_addition = wrapper::vadd(vec_sum_value1, vec_sum_value2);
- carry_addition = wrapper::vadd(carry_addition, vec_sum_value3);
- carry_addition = wrapper::vadd(carry_addition, vec_sum_value4);
-
- auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_addition), wrapper::vgetlow(carry_addition));
- carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition);
- auto res = wrapper::vgetlane(carry_paddition, 0);
-
- if(op == ReductionOperation::MEAN_SUM)
+ if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
{
- res /= in_info.dimension(0);
+ auto res = calculate_vector_index(vec_res_idx, vec_res_value, op);
+ *(reinterpret_cast<uint32_t *>(output.ptr())) = res;
}
+ else
+ {
+ auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2);
+ carry_res = wrapper::vadd(carry_res, vec_res_value3);
+ carry_res = wrapper::vadd(carry_res, vec_res_value4);
- *(output.ptr()) = static_cast<uint8_t>(res);
+ auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res));
+ carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition);
+ auto res = wrapper::vgetlane(carry_paddition, 0);
+
+ if(op == ReductionOperation::MEAN_SUM)
+ {
+ res /= in_info.dimension(0);
+ }
+
+ *(output.ptr()) = static_cast<uint8_t>(res);
+ }
}
};
-template <typename T, int S, ReductionOperation op>
+template <typename T, int S>
struct RedOpYZW
{
/** NEON vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+ using neon_vector = typename wrapper::traits::neon_vector<T, S>::type;
- inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis)
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis, const ReductionOperation op)
{
ARM_COMPUTE_UNUSED(out_slice);
execute_window_loop(in_slice, [&](const Coordinates & id)
{
- auto vec_sum_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ neon_vector vec_res_value;
+ if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN)
+ {
+ vec_res_value = wrapper::vloadq(reinterpret_cast<T *>(input.ptr()));
+ }
+ else
+ {
+ vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ }
+ uint32x4x4_t vec_res_idx{ 0 };
+
for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
{
T *in_ptr;
@@ -252,41 +557,68 @@ struct RedOpYZW
}
const auto vec_elements = wrapper::vloadq(in_ptr);
- if(op == ReductionOperation::SUM_SQUARE)
+ switch(op)
{
- vec_sum_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_sum_value);
- }
- else
- {
- vec_sum_value = wrapper::vadd(vec_elements, vec_sum_value);
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
+ break;
+ case ReductionOperation::SUM_SQUARE:
+ vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
+ break;
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
}
if(op == ReductionOperation::MEAN_SUM)
{
auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast<T>(in_info.dimension(axis)), ExactTagType{}));
- vec_sum_value = wrapper::vmul(vec_sum_value, vec_width_inv);
+ vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv);
}
- wrapper::vstore(reinterpret_cast<T *>(output.ptr()), vec_sum_value);
+ if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
+ {
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()), vec_res_idx.val[0]);
+ }
+ else
+ {
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr()), vec_res_value);
+ }
},
input, output);
}
};
-template <ReductionOperation op>
struct RedOpYZW_qasymm8
{
- inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis)
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis, const ReductionOperation op)
{
ARM_COMPUTE_UNUSED(out_slice);
execute_window_loop(in_slice, [&](const Coordinates & id)
{
- auto vec_sum_value1 = vdupq_n_u32(static_cast<uint32_t>(0.f));
- auto vec_sum_value2 = vdupq_n_u32(static_cast<uint32_t>(0.f));
- auto vec_sum_value3 = vdupq_n_u32(static_cast<uint32_t>(0.f));
- auto vec_sum_value4 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+ uint32x4x4_t vec_res_idx{ 0 };
+ auto vec_res_value1 = vdupq_n_u32(0);
+ auto vec_res_value2 = vdupq_n_u32(0);
+ auto vec_res_value3 = vdupq_n_u32(0);
+ auto vec_res_value4 = vdupq_n_u32(0);
+ auto vec_res_value = wrapper::vloadq(input.ptr());
+
for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
{
uint8_t *in_ptr;
@@ -306,169 +638,78 @@ struct RedOpYZW_qasymm8
}
const auto vec_elements = wrapper::vloadq(in_ptr);
- const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
- const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
- const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
- const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
- const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
- const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
- vec_sum_value1 = wrapper::vadd(temp32x4t_1, vec_sum_value1);
- vec_sum_value2 = wrapper::vadd(temp32x4t_2, vec_sum_value2);
- vec_sum_value3 = wrapper::vadd(temp32x4t_3, vec_sum_value3);
- vec_sum_value4 = wrapper::vadd(temp32x4t_4, vec_sum_value4);
+ switch(op)
+ {
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ {
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+ vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
+ vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
+ vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
+ vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
}
if(op == ReductionOperation::MEAN_SUM)
{
const auto vec_width_inv = wrapper::vinv(vdupq_n_f32(in_info.dimension(axis)));
- const auto vec_sum_value1_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value1), vec_width_inv);
- const auto vec_sum_value2_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value2), vec_width_inv);
- const auto vec_sum_value3_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value3), vec_width_inv);
- const auto vec_sum_value4_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value4), vec_width_inv);
-
- vec_sum_value1 = vcvtq_u32_f32(vec_sum_value1_f);
- vec_sum_value2 = vcvtq_u32_f32(vec_sum_value2_f);
- vec_sum_value3 = vcvtq_u32_f32(vec_sum_value3_f);
- vec_sum_value4 = vcvtq_u32_f32(vec_sum_value4_f);
- }
-
- const auto temp16x8t_1 = vcombine_u16(wrapper::vqmovn(vec_sum_value1), wrapper::vqmovn(vec_sum_value2));
- const auto temp16x8t_2 = vcombine_u16(wrapper::vqmovn(vec_sum_value3), wrapper::vqmovn(vec_sum_value4));
- auto res = vcombine_u8(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
- wrapper::vstore(output.ptr(), res);
- },
- input, output);
- }
-};
-
-void reduce_sumsq(const Window &window, const ITensor *input, ITensor *output, unsigned int axis)
-{
- switch(axis)
- {
- case 0:
- switch(input->info()->data_type())
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpX<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceX(window, input, output, RedOpX<float16_t, 8, ReductionOperation::SUM_SQUARE>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpX<float, 4, ReductionOperation::SUM_SQUARE>>::reduceX(window, input, output, RedOpX<float, 4, ReductionOperation::SUM_SQUARE>());
- case DataType::QASYMM8:
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- case 1:
- switch(input->info()->data_type())
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceY(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>>::reduceY(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>());
- case DataType::QASYMM8:
- default:
- ARM_COMPUTE_ERROR("Not supported");
+ const auto vec_res_value1_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value1), vec_width_inv);
+ const auto vec_res_value2_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value2), vec_width_inv);
+ const auto vec_res_value3_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value3), vec_width_inv);
+ const auto vec_res_value4_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value4), vec_width_inv);
+
+ vec_res_value1 = vcvtq_u32_f32(vec_res_value1_f);
+ vec_res_value2 = vcvtq_u32_f32(vec_res_value2_f);
+ vec_res_value3 = vcvtq_u32_f32(vec_res_value3_f);
+ vec_res_value4 = vcvtq_u32_f32(vec_res_value4_f);
}
- case 2:
- switch(input->info()->data_type())
+ if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
{
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>>::reduceZ(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>());
- case DataType::QASYMM8:
- default:
- ARM_COMPUTE_ERROR("Not supported");
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()), vec_res_idx.val[0]);
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + 4, vec_res_idx.val[1]);
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + 8, vec_res_idx.val[2]);
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + 12, vec_res_idx.val[3]);
}
- case 3:
- switch(input->info()->data_type())
+ else
{
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceW(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>>::reduceW(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>());
- case DataType::QASYMM8:
- default:
- ARM_COMPUTE_ERROR("Not supported");
+ const auto temp16x8t_1 = vcombine_u16(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
+ const auto temp16x8t_2 = vcombine_u16(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
+ auto res = vcombine_u8(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
+ wrapper::vstore(output.ptr(), res);
}
- default:
- ARM_COMPUTE_ERROR("Unsupported reduction axis");
- }
-}
-void reduce_sum(const Window &window, const ITensor *input, ITensor *output, unsigned int axis)
-{
- switch(axis)
- {
- case 0:
- switch(input->info()->data_type())
- {
- case DataType::QASYMM8:
- return Reducer<RedOpX_qasymm8<ReductionOperation::SUM>>::reduceX(window, input, output, RedOpX_qasymm8<ReductionOperation::SUM>());
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpX<float16_t, 8, ReductionOperation::SUM>>::reduceX(window, input, output, RedOpX<float16_t, 8, ReductionOperation::SUM>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpX<float, 4, ReductionOperation::SUM>>::reduceX(window, input, output, RedOpX<float, 4, ReductionOperation::SUM>());
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- case 1:
- switch(input->info()->data_type())
- {
- case DataType::QASYMM8:
- return Reducer<RedOpYZW_qasymm8<ReductionOperation::SUM>>::reduceY(window, input, output, RedOpYZW_qasymm8<ReductionOperation::SUM>());
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM>>::reduceY(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM>>::reduceY(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM>());
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- case 2:
- switch(input->info()->data_type())
- {
- case DataType::QASYMM8:
- return Reducer<RedOpYZW_qasymm8<ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW_qasymm8<ReductionOperation::SUM>());
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM>());
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- case 3:
- switch(input->info()->data_type())
- {
- case DataType::QASYMM8:
- return Reducer<RedOpYZW_qasymm8<ReductionOperation::SUM>>::reduceW(window, input, output, RedOpYZW_qasymm8<ReductionOperation::SUM>());
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM>>::reduceW(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM>>::reduceW(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM>());
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported reduction axis");
+ },
+ input, output);
}
-}
-void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output, unsigned int axis)
+};
+
+void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op)
{
switch(axis)
{
@@ -476,13 +717,13 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output
switch(input->info()->data_type())
{
case DataType::QASYMM8:
- return Reducer<RedOpX_qasymm8<ReductionOperation::MEAN_SUM>>::reduceX(window, input, output, RedOpX_qasymm8<ReductionOperation::MEAN_SUM>());
+ return Reducer<RedOpX_qasymm8>::reduceX(window, input, output, RedOpX_qasymm8(), op);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- return Reducer<RedOpX<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceX(window, input, output, RedOpX<float16_t, 8, ReductionOperation::MEAN_SUM>());
+ return Reducer<RedOpX<float16_t, 8>>::reduceX(window, input, output, RedOpX<float16_t, 8>(), op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
- return Reducer<RedOpX<float, 4, ReductionOperation::MEAN_SUM>>::reduceX(window, input, output, RedOpX<float, 4, ReductionOperation::MEAN_SUM>());
+ return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op);
default:
ARM_COMPUTE_ERROR("Not supported");
}
@@ -490,13 +731,13 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output
switch(input->info()->data_type())
{
case DataType::QASYMM8:
- return Reducer<RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>>::reduceY(window, input, output, RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>());
+ return Reducer<RedOpYZW_qasymm8>::reduceY(window, input, output, RedOpYZW_qasymm8(), op);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceY(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>());
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(), op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>>::reduceY(window, input, output, RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>());
+ return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(), op);
default:
ARM_COMPUTE_ERROR("Not supported");
}
@@ -504,13 +745,13 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output
switch(input->info()->data_type())
{
case DataType::QASYMM8:
- return Reducer<RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>>::reduceZ(window, input, output, RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>());
+ return Reducer<RedOpYZW_qasymm8>::reduceZ(window, input, output, RedOpYZW_qasymm8(), op);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>());
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8>(), op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>>::reduceZ(window, input, output, RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>());
+ return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(), op);
default:
ARM_COMPUTE_ERROR("Not supported");
}
@@ -518,13 +759,13 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output
switch(input->info()->data_type())
{
case DataType::QASYMM8:
- return Reducer<RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>>::reduceW(window, input, output, RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>());
+ return Reducer<RedOpYZW_qasymm8>::reduceW(window, input, output, RedOpYZW_qasymm8(), op);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceW(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>());
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, RedOpYZW<float16_t, 8>(), op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>>::reduceW(window, input, output, RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>());
+ return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(), op);
default:
ARM_COMPUTE_ERROR("Not supported");
}
@@ -533,14 +774,6 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output
}
}
-TensorShape calculate_output_shape(const TensorShape &input_shape, unsigned int axis)
-{
- TensorShape output_shape{ input_shape };
- output_shape.set(axis, 1);
-
- return output_shape;
-}
-
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
ARM_COMPUTE_UNUSED(op);
@@ -553,10 +786,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
if(output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN);
+ if(!is_arg_min_max)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32);
+ }
+
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- const TensorShape output_shape = calculate_output_shape(input->tensor_shape(), axis);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped);
}
@@ -564,13 +806,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
return Status{};
}
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis)
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
// Calculate output shape and set if empty
- const TensorShape output_shape = calculate_output_shape(input->tensor_shape(), axis);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
// Output auto initialization if not yet initialized
- auto_init_if_empty(*output, output_shape, 1, input->data_type());
+ const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
+ DataType output_data_type = is_arg_min_max ? DataType::U32 : input->data_type();
+ auto_init_if_empty(*output, output_shape, 1, output_data_type);
unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type());
@@ -613,7 +857,7 @@ void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output
_reduction_axis = axis;
// Configure kernel window
- auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis);
+ auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis, op);
ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
@@ -623,7 +867,7 @@ void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output
Status NEReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis)));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis, op)));
return Status{};
}
@@ -634,19 +878,6 @@ void NEReductionOperationKernel::run(const Window &window, const ThreadInfo &inf
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- switch(_op)
- {
- case ReductionOperation::SUM_SQUARE:
- reduce_sumsq(window, _input, _output, _reduction_axis);
- break;
- case ReductionOperation::MEAN_SUM:
- reduce_mean_sum(window, _input, _output, _reduction_axis);
- break;
- case ReductionOperation::SUM:
- reduce_sum(window, _input, _output, _reduction_axis);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported reduction operation.");
- }
+ reduce_op(window, _input, _output, _reduction_axis, _op);
}
} // namespace arm_compute