author    Michalis Spyrou <michalis.spyrou@arm.com>  2019-01-03 11:10:25 +0000
committer Michalis Spyrou <michalis.spyrou@arm.com>  2019-01-10 16:24:26 +0000
commit    aea14c63e2efeda9d5f7492099389d439c65204f (patch)
tree      176a6181bbf00e4df078d5da0a17dd44f248958e
parent    c10bc0b5db5169a6ccea02a1aaefe34f082709e5 (diff)
download  ComputeLibrary-aea14c63e2efeda9d5f7492099389d439c65204f.tar.gz
COMPMID-1764 NEON: Implement ArgMax/ArgMin
Change-Id: Ibe23aa90b36ffd8553d1d1c35fada5d300fab829
Reviewed-on: https://review.mlplatform.org/475
Reviewed-by: Isabella Gottardi <isabella.gottardi@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Giuseppe Rossini <giuseppe.rossini@arm.com>
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/bsl.h (renamed from arm_compute/core/NEON/wrapper/intrinsics/bitselect.h) |  52
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/ceq.h                                                                    |  64
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/cgt.h (renamed from arm_compute/core/NEON/wrapper/intrinsics/greaterthan.h) |  42
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/clt.h                                                                    |  64
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h                                                             |   9
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/orr.h                                                                    |  60
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/pmax.h                                                                   |  53
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/pmin.h                                                                   |  53
-rw-r--r--  arm_compute/core/utils/misc/ShapeCalculator.h                                                                     |  15
-rw-r--r--  arm_compute/runtime/NEON/NEFunctions.h                                                                            |   1
-rw-r--r--  arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h                                                             |  78
-rw-r--r--  src/core/NEON/kernels/NEReductionOperationKernel.cpp                                                              | 765
-rw-r--r--  src/core/NEON/kernels/NESelectKernel.cpp                                                                          |  14
-rw-r--r--  src/runtime/NEON/functions/NEArgMinMaxLayer.cpp                                                                   |  71
-rw-r--r--  tests/validation/NEON/ArgMinMax.cpp                                                                               | 167
-rw-r--r--  tests/validation/fixtures/ArgMinMaxFixture.h                                                                      |  60
-rw-r--r--  tests/validation/fixtures/ReduceMeanFixture.h                                                                     |   4
-rw-r--r--  tests/validation/fixtures/ReductionOperationFixture.h                                                             |   4
-rw-r--r--  tests/validation/reference/L2NormalizeLayer.cpp                                                                   |   4
-rw-r--r--  tests/validation/reference/ReductionOperation.cpp                                                                 |  59
-rw-r--r--  tests/validation/reference/ReductionOperation.h                                                                   |   8
21 files changed, 1257 insertions(+), 390 deletions(-)
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/bitselect.h b/arm_compute/core/NEON/wrapper/intrinsics/bsl.h
index 8223f6d463..9831b4b842 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/bitselect.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/bsl.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -14,15 +14,15 @@
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT SELECT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef __ARM_COMPUTE_WRAPPER_BITSELECT_H__
-#define __ARM_COMPUTE_WRAPPER_BITSELECT_H__
+#ifndef __ARM_COMPUTE_WRAPPER_BSL_H__
+#define __ARM_COMPUTE_WRAPPER_BSL_H__
#include <arm_neon.h>
@@ -30,35 +30,35 @@ namespace arm_compute
{
namespace wrapper
{
-#define VBITSELECT_IMPL(stype, vtype, ctype, prefix, postfix) \
- inline vtype vbitselect(const ctype &a, const vtype &b, const vtype &c) \
- { \
- return prefix##_##postfix(a, b, c); \
+#define VBSL_IMPL(vctype, vtype, prefix, postfix) \
+ inline vtype vbsl(const vctype &a, const vtype &b, const vtype &c) \
+ { \
+ return prefix##_##postfix(a, b, c); \
}
-VBITSELECT_IMPL(uint8_t, uint8x8_t, uint8x8_t, vbsl, u8)
-VBITSELECT_IMPL(int8_t, int8x8_t, uint8x8_t, vbsl, s8)
-VBITSELECT_IMPL(uint16_t, uint16x4_t, uint16x4_t, vbsl, u16)
-VBITSELECT_IMPL(int16_t, int16x4_t, uint16x4_t, vbsl, s16)
-VBITSELECT_IMPL(uint32_t, uint32x2_t, uint32x2_t, vbsl, u32)
-VBITSELECT_IMPL(int32_t, int32x2_t, uint32x2_t, vbsl, s32)
-VBITSELECT_IMPL(float32x2_t, float32x2_t, uint32x2_t, vbsl, f32)
+VBSL_IMPL(uint8x8_t, uint8x8_t, vbsl, u8)
+VBSL_IMPL(uint8x8_t, int8x8_t, vbsl, s8)
+VBSL_IMPL(uint16x4_t, uint16x4_t, vbsl, u16)
+VBSL_IMPL(uint16x4_t, int16x4_t, vbsl, s16)
+VBSL_IMPL(uint32x2_t, uint32x2_t, vbsl, u32)
+VBSL_IMPL(uint32x2_t, int32x2_t, vbsl, s32)
+VBSL_IMPL(uint32x2_t, float32x2_t, vbsl, f32)
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VBITSELECT_IMPL(float16x4_t, float16x4_t, uint16x4_t, vbsl, f16)
+VBSL_IMPL(uint16x4_t, float16x4_t, vbsl, f16)
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VBITSELECT_IMPL(uint8_t, uint8x16_t, uint8x16_t, vbslq, u8)
-VBITSELECT_IMPL(int8_t, int8x16_t, uint8x16_t, vbslq, s8)
-VBITSELECT_IMPL(uint16_t, uint16x8_t, uint16x8_t, vbslq, u16)
-VBITSELECT_IMPL(int16_t, int16x8_t, uint16x8_t, vbslq, s16)
-VBITSELECT_IMPL(uint32_t, uint32x4_t, uint32x4_t, vbslq, u32)
-VBITSELECT_IMPL(int32_t, int32x4_t, uint32x4_t, vbslq, s32)
-VBITSELECT_IMPL(float32x4_t, float32x4_t, uint32x4_t, vbslq, f32)
+VBSL_IMPL(uint8x16_t, uint8x16_t, vbslq, u8)
+VBSL_IMPL(uint8x16_t, int8x16_t, vbslq, s8)
+VBSL_IMPL(uint16x8_t, uint16x8_t, vbslq, u16)
+VBSL_IMPL(uint16x8_t, int16x8_t, vbslq, s16)
+VBSL_IMPL(uint32x4_t, uint32x4_t, vbslq, u32)
+VBSL_IMPL(uint32x4_t, int32x4_t, vbslq, s32)
+VBSL_IMPL(uint32x4_t, float32x4_t, vbslq, f32)
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VBITSELECT_IMPL(float16x8_t, float16x8_t, uint16x8_t, vbslq, f16)
+VBSL_IMPL(uint16x8_t, float16x8_t, vbslq, f16)
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#undef VBITSELECT_IMPL
+#undef VBSL_IMPL
} // namespace wrapper
} // namespace arm_compute
-#endif /* __ARM_COMPUTE_WRAPPER_BITSELECT_H__ */
+#endif /* __ARM_COMPUTE_WRAPPER_BSL_H__ */
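A minimal sketch (not part of the patch) of what the renamed vbsl wrapper resolves to, assuming standard NEON semantics: vbslq_u32(mask, a, b) takes each bit from a where the mask bit is set and from b otherwise, so an all-ones/all-zeros comparison mask acts as a per-lane select. This is the mechanism the reduction kernel further down uses to swap in new indices:

    #include <arm_neon.h>

    // Per-lane select via bit select: lanes with an all-ones mask come from a,
    // the rest from b. Values here are illustrative.
    uint32x4_t select_lanes_example()
    {
        const uint32_t av[4]    = { 10, 11, 12, 13 };
        const uint32_t bv[4]    = { 20, 21, 22, 23 };
        const uint32_t maskv[4] = { 0xFFFFFFFFu, 0u, 0xFFFFFFFFu, 0u };
        return vbslq_u32(vld1q_u32(maskv), vld1q_u32(av), vld1q_u32(bv)); // { 10, 21, 12, 23 }
    }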
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/ceq.h b/arm_compute/core/NEON/wrapper/intrinsics/ceq.h
new file mode 100644
index 0000000000..812ac326a8
--- /dev/null
+++ b/arm_compute/core/NEON/wrapper/intrinsics/ceq.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_WRAPPER_CEQ_H__
+#define __ARM_COMPUTE_WRAPPER_CEQ_H__
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VCEQ_IMPL(votype, vtype, prefix, postfix) \
+ inline votype vceq(const vtype &a, const vtype &b) \
+ { \
+ return prefix##_##postfix(a, b); \
+ }
+
+VCEQ_IMPL(uint8x8_t, uint8x8_t, vceq, u8)
+VCEQ_IMPL(uint8x8_t, int8x8_t, vceq, s8)
+VCEQ_IMPL(uint16x4_t, uint16x4_t, vceq, u16)
+VCEQ_IMPL(uint16x4_t, int16x4_t, vceq, s16)
+VCEQ_IMPL(uint32x2_t, uint32x2_t, vceq, u32)
+VCEQ_IMPL(uint32x2_t, int32x2_t, vceq, s32)
+VCEQ_IMPL(uint32x2_t, float32x2_t, vceq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCEQ_IMPL(uint16x4_t, float16x4_t, vceq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VCEQ_IMPL(uint8x16_t, uint8x16_t, vceqq, u8)
+VCEQ_IMPL(uint8x16_t, int8x16_t, vceqq, s8)
+VCEQ_IMPL(uint16x8_t, uint16x8_t, vceqq, u16)
+VCEQ_IMPL(uint16x8_t, int16x8_t, vceqq, s16)
+VCEQ_IMPL(uint32x4_t, uint32x4_t, vceqq, u32)
+VCEQ_IMPL(uint32x4_t, int32x4_t, vceqq, s32)
+VCEQ_IMPL(uint32x4_t, float32x4_t, vceqq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCEQ_IMPL(uint16x8_t, float16x8_t, vceqq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VCEQ_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_WRAPPER_CEQ_H__ */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/greaterthan.h b/arm_compute/core/NEON/wrapper/intrinsics/cgt.h
index 5ee7516a4e..c2ed9df1dc 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/greaterthan.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/cgt.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,32 +30,32 @@ namespace arm_compute
{
namespace wrapper
{
-#define VCGT_IMPL(stype, vtype, rtype, prefix, postfix) \
- inline rtype vgreaterthan(const vtype &a, const vtype &b) \
- { \
- return prefix##_##postfix(a, b); \
+#define VCGT_IMPL(votype, vtype, prefix, postfix) \
+ inline votype vcgt(const vtype &a, const vtype &b) \
+ { \
+ return prefix##_##postfix(a, b); \
}
-VCGT_IMPL(uint8_t, uint8x8_t, uint8x8_t, vcgt, u8)
-VCGT_IMPL(int8_t, int8x8_t, uint8x8_t, vcgt, s8)
-VCGT_IMPL(uint16_t, uint16x4_t, uint16x4_t, vcgt, u16)
-VCGT_IMPL(int16_t, int16x4_t, uint16x4_t, vcgt, s16)
-VCGT_IMPL(uint32_t, uint32x2_t, uint32x2_t, vcgt, u32)
-VCGT_IMPL(int32_t, int32x2_t, uint32x2_t, vcgt, s32)
-VCGT_IMPL(float32x2_t, float32x2_t, uint32x2_t, vcgt, f32)
+VCGT_IMPL(uint8x8_t, uint8x8_t, vcgt, u8)
+VCGT_IMPL(uint8x8_t, int8x8_t, vcgt, s8)
+VCGT_IMPL(uint16x4_t, uint16x4_t, vcgt, u16)
+VCGT_IMPL(uint16x4_t, int16x4_t, vcgt, s16)
+VCGT_IMPL(uint32x2_t, uint32x2_t, vcgt, u32)
+VCGT_IMPL(uint32x2_t, int32x2_t, vcgt, s32)
+VCGT_IMPL(uint32x2_t, float32x2_t, vcgt, f32)
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VCGT_IMPL(float16x4_t, float16x4_t, uint16x4_t, vcgt, f16)
+VCGT_IMPL(uint16x4_t, float16x4_t, vcgt, f16)
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VCGT_IMPL(uint8_t, uint8x16_t, uint8x16_t, vcgtq, u8)
-VCGT_IMPL(int8_t, int8x16_t, uint8x16_t, vcgtq, s8)
-VCGT_IMPL(uint16_t, uint16x8_t, uint16x8_t, vcgtq, u16)
-VCGT_IMPL(int16_t, int16x8_t, uint16x8_t, vcgtq, s16)
-VCGT_IMPL(uint32_t, uint32x4_t, uint32x4_t, vcgtq, u32)
-VCGT_IMPL(int32_t, int32x4_t, uint32x4_t, vcgtq, s32)
-VCGT_IMPL(float32x4_t, float32x4_t, uint32x4_t, vcgtq, f32)
+VCGT_IMPL(uint8x16_t, uint8x16_t, vcgtq, u8)
+VCGT_IMPL(uint8x16_t, int8x16_t, vcgtq, s8)
+VCGT_IMPL(uint16x8_t, uint16x8_t, vcgtq, u16)
+VCGT_IMPL(uint16x8_t, int16x8_t, vcgtq, s16)
+VCGT_IMPL(uint32x4_t, uint32x4_t, vcgtq, u32)
+VCGT_IMPL(uint32x4_t, int32x4_t, vcgtq, s32)
+VCGT_IMPL(uint32x4_t, float32x4_t, vcgtq, f32)
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VCGT_IMPL(float16x8_t, float16x8_t, uint16x8_t, vcgtq, f16)
+VCGT_IMPL(uint16x8_t, float16x8_t, vcgtq, f16)
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#undef VCGT_IMPL
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/clt.h b/arm_compute/core/NEON/wrapper/intrinsics/clt.h
new file mode 100644
index 0000000000..a187c216d7
--- /dev/null
+++ b/arm_compute/core/NEON/wrapper/intrinsics/clt.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_WRAPPER_CLT_H__
+#define __ARM_COMPUTE_WRAPPER_CLT_H__
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VCLT_IMPL(votype, vtype, prefix, postfix) \
+ inline votype vclt(const vtype &a, const vtype &b) \
+ { \
+ return prefix##_##postfix(a, b); \
+ }
+
+VCLT_IMPL(uint8x8_t, uint8x8_t, vclt, u8)
+VCLT_IMPL(uint8x8_t, int8x8_t, vclt, s8)
+VCLT_IMPL(uint16x4_t, uint16x4_t, vclt, u16)
+VCLT_IMPL(uint16x4_t, int16x4_t, vclt, s16)
+VCLT_IMPL(uint32x2_t, uint32x2_t, vclt, u32)
+VCLT_IMPL(uint32x2_t, int32x2_t, vclt, s32)
+VCLT_IMPL(uint32x2_t, float32x2_t, vclt, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCLT_IMPL(uint16x4_t, float16x4_t, vclt, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VCLT_IMPL(uint8x16_t, uint8x16_t, vcltq, u8)
+VCLT_IMPL(uint8x16_t, int8x16_t, vcltq, s8)
+VCLT_IMPL(uint16x8_t, uint16x8_t, vcltq, u16)
+VCLT_IMPL(uint16x8_t, int16x8_t, vcltq, s16)
+VCLT_IMPL(uint32x4_t, uint32x4_t, vcltq, u32)
+VCLT_IMPL(uint32x4_t, int32x4_t, vcltq, s32)
+VCLT_IMPL(uint32x4_t, float32x4_t, vcltq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCLT_IMPL(uint16x8_t, float16x8_t, vcltq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VCLT_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_WRAPPER_CLT_H__ */
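The three comparison wrappers (vceq, vcgt, vclt) all follow the same convention: they return an unsigned mask of the input's lane width whose lanes are all ones where the predicate holds and all zeros elsewhere, ready to feed into vbsl. A minimal sketch, illustrative only:

    #include <arm_neon.h>

    // vcltq_f32 yields 0xFFFFFFFF in lane i iff a[i] < b[i], 0 otherwise;
    // the mask can be handed straight to vbslq_f32/vbslq_u32.
    uint32x4_t less_than_mask(float32x4_t a, float32x4_t b)
    {
        return vcltq_f32(a, b);
    }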
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
index d00d3303f1..97af983e62 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
@@ -26,14 +26,16 @@
#include "arm_compute/core/NEON/wrapper/intrinsics/add.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/and.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/bitselect.h"
+#include "arm_compute/core/NEON/wrapper/intrinsics/bsl.h"
+#include "arm_compute/core/NEON/wrapper/intrinsics/ceq.h"
+#include "arm_compute/core/NEON/wrapper/intrinsics/cgt.h"
+#include "arm_compute/core/NEON/wrapper/intrinsics/clt.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/combine.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/dup_n.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/exp.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/gethigh.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/getlane.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/getlow.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/greaterthan.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/inv.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/load.h"
@@ -44,7 +46,10 @@
#include "arm_compute/core/NEON/wrapper/intrinsics/movn.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/mul.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/neg.h"
+#include "arm_compute/core/NEON/wrapper/intrinsics/orr.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/padd.h"
+#include "arm_compute/core/NEON/wrapper/intrinsics/pmax.h"
+#include "arm_compute/core/NEON/wrapper/intrinsics/pmin.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/pow.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/rev64.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/store.h"
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/orr.h b/arm_compute/core/NEON/wrapper/intrinsics/orr.h
new file mode 100644
index 0000000000..d82dc56a6d
--- /dev/null
+++ b/arm_compute/core/NEON/wrapper/intrinsics/orr.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_WRAPPER_ORR_H__
+#define __ARM_COMPUTE_WRAPPER_ORR_H__
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VORR_IMPL(stype, vtype, prefix, postfix) \
+ inline vtype vorr(const vtype &a, const vtype &b) \
+ { \
+ return prefix##_##postfix(a, b); \
+ }
+
+VORR_IMPL(uint8_t, uint8x8_t, vorr, u8)
+VORR_IMPL(int8_t, int8x8_t, vorr, s8)
+VORR_IMPL(uint16_t, uint16x4_t, vorr, u16)
+VORR_IMPL(int16_t, int16x4_t, vorr, s16)
+VORR_IMPL(uint32_t, uint32x2_t, vorr, u32)
+VORR_IMPL(int32_t, int32x2_t, vorr, s32)
+VORR_IMPL(uint64_t, uint64x1_t, vorr, u64)
+VORR_IMPL(int64_t, int64x1_t, vorr, s64)
+
+VORR_IMPL(uint8_t, uint8x16_t, vorrq, u8)
+VORR_IMPL(int8_t, int8x16_t, vorrq, s8)
+VORR_IMPL(uint16_t, uint16x8_t, vorrq, u16)
+VORR_IMPL(int16_t, int16x8_t, vorrq, s16)
+VORR_IMPL(uint32_t, uint32x4_t, vorrq, u32)
+VORR_IMPL(int32_t, int32x4_t, vorrq, s32)
+VORR_IMPL(uint64_t, uint64x2_t, vorrq, u64)
+VORR_IMPL(int64_t, int64x2_t, vorrq, s64)
+
+#undef VORR_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_WRAPPER_ORR_H__ */
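The orr wrapper earns its place in the argmin/argmax path through a mask-widening idiom used in calculate_vector_index further down: shifting a saturated 0xFF byte mask left by its element width and OR-ing it with its zero-extended copy produces an all-ones mask at the next element size. A sketch under standard NEON semantics, illustrative only:

    #include <arm_neon.h>

    // 0xFF << 8 == 0xFF00, OR-ed with the zero-extended 0x00FF, gives 0xFFFF,
    // i.e. an all-ones u16 mask wherever the u8 mask was all ones.
    uint16x8_t widen_u8_mask(uint8x16_t mask_u8)
    {
        const uint8x8_t low = vget_low_u8(mask_u8);
        return vorrq_u16(vshll_n_u8(low, 8), vmovl_u8(low));
    }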
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/pmax.h b/arm_compute/core/NEON/wrapper/intrinsics/pmax.h
new file mode 100644
index 0000000000..7f701f89c4
--- /dev/null
+++ b/arm_compute/core/NEON/wrapper/intrinsics/pmax.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_WRAPPER_PMAX_H__
+#define __ARM_COMPUTE_WRAPPER_PMAX_H__
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VPMAX_IMPL(stype, vtype, prefix, postfix) \
+ inline vtype vpmax(const vtype &a, const vtype &b) \
+ { \
+ return prefix##_##postfix(a, b); \
+ }
+
+VPMAX_IMPL(uint8_t, uint8x8_t, vpmax, u8)
+VPMAX_IMPL(int8_t, int8x8_t, vpmax, s8)
+VPMAX_IMPL(uint16_t, uint16x4_t, vpmax, u16)
+VPMAX_IMPL(int16_t, int16x4_t, vpmax, s16)
+VPMAX_IMPL(uint32_t, uint32x2_t, vpmax, u32)
+VPMAX_IMPL(int32_t, int32x2_t, vpmax, s32)
+VPMAX_IMPL(float, float32x2_t, vpmax, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VPMAX_IMPL(float16_t, float16x4_t, vpmax, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VPMAX_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_WRAPPER_PMAX_H__ */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/pmin.h b/arm_compute/core/NEON/wrapper/intrinsics/pmin.h
new file mode 100644
index 0000000000..52d5eb17a0
--- /dev/null
+++ b/arm_compute/core/NEON/wrapper/intrinsics/pmin.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_WRAPPER_PMIN_H__
+#define __ARM_COMPUTE_WRAPPER_PMIN_H__
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VPMIN_IMPL(stype, vtype, prefix, postfix) \
+ inline vtype vpmin(const vtype &a, const vtype &b) \
+ { \
+ return prefix##_##postfix(a, b); \
+ }
+
+VPMIN_IMPL(uint8_t, uint8x8_t, vpmin, u8)
+VPMIN_IMPL(int8_t, int8x8_t, vpmin, s8)
+VPMIN_IMPL(uint16_t, uint16x4_t, vpmin, u16)
+VPMIN_IMPL(int16_t, int16x4_t, vpmin, s16)
+VPMIN_IMPL(uint32_t, uint32x2_t, vpmin, u32)
+VPMIN_IMPL(int32_t, int32x2_t, vpmin, s32)
+VPMIN_IMPL(float, float32x2_t, vpmin, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VPMIN_IMPL(float16_t, float16x4_t, vpmin, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VPMIN_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_WRAPPER_PMIN_H__ */
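pmax and pmin provide pairwise (horizontal) reductions: each call halves the number of candidates, so a handful of calls folds a whole vector down to a single value. This is the pattern the kernel below uses to find the winning min/max before recovering its index. A minimal sketch, illustrative only:

    #include <arm_neon.h>

    // Fold a float32x4_t to its maximum element with two pairwise max steps.
    float horizontal_max(float32x4_t v)
    {
        float32x2_t m = vpmax_f32(vget_high_f32(v), vget_low_f32(v)); // 4 -> 2 candidates
        m = vpmax_f32(m, m);                                          // 2 -> 1
        return vget_lane_f32(m, 0);
    }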
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index 70727424b3..619234d306 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -1033,6 +1033,21 @@ inline TensorShape compute_tiled_shape(const TensorShape &input_shape, const Mul
return tiled_shape;
}
+/** Calculate the reduced shape of a tensor given an axis
+ *
+ * @param[in] input Input tensor info
+ * @param[in] axis Axis on which to perform reduction
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_reduced_shape(const TensorShape &input, unsigned int axis)
+{
+ TensorShape output_shape{ input };
+ output_shape.set(axis, 1);
+
+ return output_shape;
+}
+
/** Calculate the upsampled shape of a tensor
*
* @param[in] input Input tensor info
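As an illustration of the new helper (values not from the patch, and assuming the usual misc::shape_calculator namespace): reducing axis 1 of an (8, 4, 2) tensor collapses that dimension to a single element.

    const TensorShape in_shape(8U, 4U, 2U);
    const TensorShape out_shape = compute_reduced_shape(in_shape, 1);
    // out_shape == TensorShape(8U, 1U, 2U)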
diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h
index 2e94030e53..2daef70cef 100644
--- a/arm_compute/runtime/NEON/NEFunctions.h
+++ b/arm_compute/runtime/NEON/NEFunctions.h
@@ -28,6 +28,7 @@
#include "arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h"
#include "arm_compute/runtime/NEON/functions/NEAccumulate.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
diff --git a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
new file mode 100644
index 0000000000..87d77a5e13
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEARGMINMAXLAYER_H__
+#define __ARM_COMPUTE_NEARGMINMAXLAYER_H__
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEReductionOperationKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Function to calculate the index of the minimum or maximum values in a tensor based on an axis.
+ * This function calls the following NEON kernels:
+ *
+ * -# @ref NEReductionOperationKernel
+ * -# @ref NEFillBorderKernel
+ *
+ */
+class NEArgMinMaxLayer : public IFunction
+{
+public:
+ /** Constructor */
+ NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Input source tensor. Data types supported: F16/F32.
+ * @param[in] axis Axis to find max/min index.
+ * @param[out] output Output tensor. Data types supported: U32.
+ * @param[in] op Operation to perform: min or max
+ */
+ void configure(ITensor *input, int axis, ITensor *output, const ReductionOperation &op);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEArgMinMaxLayer
+ *
+ * @param[in] input Input source tensor info. Data types supported: F16/F32.
+ * @param[in] axis Axis to find max/min index.
+ * @param[in] output Output tensor info. Data types supported: U32.
+ * @param[in] op Operation to perform: min or max
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ MemoryGroup _memory_group;
+ NEReductionOperationKernel _reduction_kernel;
+ NEFillBorderKernel _fill_border_kernel;
+ bool _run_fill_border;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEARGMINMAXLAYER_H__ */
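A minimal usage sketch of the new function; the shapes and the fill step are illustrative, not taken from the patch:

    #include "arm_compute/runtime/NEON/NEFunctions.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void argmax_example()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));
        // Reducing axis 0: output keeps shape (1, 4) and holds U32 indices.
        dst.allocator()->init(TensorInfo(TensorShape(1U, 4U), 1, DataType::U32));

        NEArgMinMaxLayer argmax;
        argmax.configure(&src, 0, &dst, ReductionOperation::ARG_IDX_MAX);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src ...
        argmax.run();
    }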
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 9306e0303d..64e3cfe404 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,6 +31,7 @@
#include "arm_compute/core/NEON/NEMath.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
@@ -39,11 +40,225 @@ namespace arm_compute
{
namespace
{
+uint32x4x4_t calculate_index(uint32_t idx, float32x4_t a, float32x4_t b, uint32x4x4_t c, ReductionOperation op, int axis)
+{
+ uint32x4_t mask{ 0 };
+ if(op == ReductionOperation::ARG_IDX_MIN)
+ {
+ mask = wrapper::vcgt(b, a);
+ }
+ else
+ {
+ mask = wrapper::vclt(b, a);
+ }
+
+ uint32x4_t vec_idx = { idx, idx + 1, idx + 2, idx + 3 };
+ if(axis != 0)
+ {
+ vec_idx = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ }
+ uint32x4x4_t res = { wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0 };
+
+ return res;
+}
+
+uint32x4x4_t calculate_index(uint32_t idx, uint8x16_t a, uint8x16_t b, uint32x4x4_t c, ReductionOperation op, int axis)
+{
+ uint32x4x4_t mask{ 0 };
+ uint8x16_t mask_u8{ 0 };
+ if(op == ReductionOperation::ARG_IDX_MIN)
+ {
+ mask_u8 = wrapper::vcgt(b, a);
+ }
+ else
+ {
+ mask_u8 = wrapper::vclt(b, a);
+ }
+ mask.val[0] = wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(mask_u8))));
+ mask.val[1] = wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(mask_u8))));
+ mask.val[2] = wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(mask_u8))));
+ mask.val[3] = wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(mask_u8))));
+ uint32x4x4_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
+ { idx + 4, idx + 5, idx + 6, idx + 7 },
+ { idx + 8, idx + 9, idx + 10, idx + 11 },
+ { idx + 12, idx + 13, idx + 14, idx + 15 }
+ }
+ };
+ if(axis != 0)
+ {
+ vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ vec_idx.val[2] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ vec_idx.val[3] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ }
+ uint32x4x4_t res = { vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]),
+ vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]),
+ vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]),
+ vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])
+ };
+
+ return res;
+}
+
+uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float32x4_t vec_res_value, ReductionOperation op)
+{
+ uint32x4_t res_idx_mask{ 0 };
+ uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
+
+ if(op == ReductionOperation::ARG_IDX_MIN)
+ {
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ pmin = wrapper::vpmin(pmin, pmin);
+ auto mask = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
+ res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask);
+ }
+ else
+ {
+ auto pmax = wrapper::vpmax(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ pmax = wrapper::vpmax(pmax, pmax);
+ auto mask = vceqq_f32(vec_res_value, wrapper::vcombine(pmax, pmax));
+ res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask);
+ }
+
+ res_idx_mask = wrapper::vadd(res_idx_mask, mask_ones);
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask), wrapper::vgetlow(res_idx_mask));
+ pmin = wrapper::vpmin(pmin, pmin);
+ uint32_t res = wrapper::vgetlane(pmin, 0);
+
+ return (res - 0xFFFFFFFF);
+}
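The 0xFFFFFFFF arithmetic above is the subtle part: AND-ing the index lanes with the equality mask zeroes every lane that did not produce the extremum, and adding 0xFFFFFFFF (subtracting 1 modulo 2^32) turns those zeros into the largest u32 while surviving lanes become idx - 1; the pairwise min then selects the smallest matching index, and the final res - 0xFFFFFFFF (i.e. + 1) removes the offset. A scalar sketch of the per-lane encoding, purely illustrative:

    #include <cstdint>

    // mask is 0 or 0xFFFFFFFF. Matching lanes encode to idx - 1, all others to
    // UINT32_MAX, so a min across lanes picks the smallest matching index.
    uint32_t encode_lane(uint32_t idx, uint32_t mask)
    {
        return (idx & mask) + 0xFFFFFFFFu; // wraps: idx - 1, or 0xFFFFFFFF
    }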
+
+uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, uint8x16_t vec_res_value, ReductionOperation op)
+{
+ uint32x4x4_t res_idx_mask{ 0 };
+ uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
+ uint8x16_t mask_u8{ 0 };
+ if(op == ReductionOperation::ARG_IDX_MIN)
+ {
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ pmin = wrapper::vpmin(pmin, pmin);
+ pmin = wrapper::vpmin(pmin, pmin);
+ pmin = wrapper::vpmin(pmin, pmin);
+ mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
+ }
+ else
+ {
+ auto pmax = wrapper::vpmax(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ pmax = wrapper::vpmax(pmax, pmax);
+ pmax = wrapper::vpmax(pmax, pmax);
+ pmax = wrapper::vpmax(pmax, pmax);
+ mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax));
+ }
+
+ // Widen vectors
+ auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
+ auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
+ auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
+ auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
+ auto wide_u32_3 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
+ auto wide_u32_4 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
+ res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1);
+ res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2);
+ res_idx_mask.val[2] = wrapper::vand(vec_res_idx.val[2], wide_u32_3);
+ res_idx_mask.val[3] = wrapper::vand(vec_res_idx.val[3], wide_u32_4);
+ res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones);
+ res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones);
+ res_idx_mask.val[2] = wrapper::vadd(res_idx_mask.val[2], mask_ones);
+ res_idx_mask.val[3] = wrapper::vadd(res_idx_mask.val[3], mask_ones);
+
+ uint32_t res = 0xFFFFFFFF;
+ int iter = 0;
+ do
+ {
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter]));
+ pmin = wrapper::vpmin(pmin, pmin);
+ res = std::min(wrapper::vgetlane(pmin, 0), res);
+ iter++;
+ }
+ while(iter < 4);
+
+ return (res - 0xFFFFFFFF);
+}
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis)
+{
+ uint32x4x2_t mask{ 0 };
+ uint16x8_t mask_u16{ 0 };
+ if(op == ReductionOperation::ARG_IDX_MIN)
+ {
+ mask_u16 = wrapper::vcgt(b, a);
+ }
+ else
+ {
+ mask_u16 = wrapper::vclt(b, a);
+ }
+ mask.val[0] = wrapper::vmovl(wrapper::vgetlow(mask_u16));
+ mask.val[1] = wrapper::vmovl(wrapper::vgethigh(mask_u16));
+ uint32x4x2_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
+ { idx + 4, idx + 5, idx + 6, idx + 7 }
+ }
+ };
+ if(axis != 0)
+ {
+ vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
+ }
+ uint32x4x4_t res = { wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]),
+ wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]),
+ 0, 0
+ };
+
+ return res;
+}
+
+uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_value, ReductionOperation op)
+{
+ uint32x4x2_t res_idx_mask{ 0 };
+ uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
+ uint16x8_t mask_u16;
+ if(op == ReductionOperation::ARG_IDX_MIN)
+ {
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ pmin = wrapper::vpmin(pmin, pmin);
+ pmin = wrapper::vpmin(pmin, pmin);
+ mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
+ }
+ else
+ {
+ auto pmax = wrapper::vpmax(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ pmax = wrapper::vpmax(pmax, pmax);
+ pmax = wrapper::vpmax(pmax, pmax);
+ mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax));
+ }
+
+ // Widen vectors
+ auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16)));
+ auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16)));
+ res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1);
+ res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2);
+ res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones);
+ res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones);
+
+ uint32_t res = 0xFFFFFFFF;
+ int iter = 0;
+ do
+ {
+ auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter]));
+ pmin = wrapper::vpmin(pmin, pmin);
+ res = std::min(wrapper::vgetlane(pmin, 0), res);
+ iter++;
+ }
+ while(iter < 2);
+
+ return (res - 0xFFFFFFFF);
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
template <class F>
class Reducer
{
public:
- static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f)
+ static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
{
// Set out window
Window out_window(window);
@@ -58,11 +273,11 @@ public:
Iterator in(input, in_slice);
Iterator out(output, out_slice);
- f(in, out, in_slice, out_slice, *input->info());
+ f(in, out, in_slice, out_slice, *input->info(), op);
}
while(window.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
}
- static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f)
+ static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
{
// Set in window
Window in_window(window);
@@ -80,11 +295,11 @@ public:
Iterator in(input, in_slice);
Iterator out(output, out_slice);
- f(in, out, in_slice, out_slice, *input->info(), 1);
+ f(in, out, in_slice, out_slice, *input->info(), 1, op);
}
while(in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
}
- static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f)
+ static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
{
// Set in window
Window in_window(window);
@@ -102,11 +317,11 @@ public:
Iterator in(input, in_slice);
Iterator out(output, out_slice);
- f(in, out, in_slice, out_slice, *input->info(), 2);
+ f(in, out, in_slice, out_slice, *input->info(), 2, op);
}
while(in_window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_3D(out_slice));
}
- static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f)
+ static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
{
// Set in/out window
Window in_window(window);
@@ -124,115 +339,205 @@ public:
Iterator in(input, in_slice);
Iterator out(output, out_slice);
- f(in, out, in_slice, out_slice, *input->info(), 3);
+ f(in, out, in_slice, out_slice, *input->info(), 3, op);
}
while(in_window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_4D(out_slice));
}
};
-template <typename T, int S, ReductionOperation op>
+template <typename T, int S>
struct RedOpX
{
/** NEON vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
- inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info)
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, const ReductionOperation op)
{
ARM_COMPUTE_UNUSED(out_slice);
- auto vec_sum_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ auto init_res_value = static_cast<T>(0.f);
+ if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN)
+ {
+ init_res_value = *reinterpret_cast<T *>(input.ptr());
+ }
+ auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{});
+ uint32x4x4_t vec_res_idx{ 0 };
execute_window_loop(in_slice, [&](const Coordinates & id)
{
const auto in_ptr = reinterpret_cast<const T *>(input.ptr());
const auto vec_elements = wrapper::vloadq(in_ptr);
- if(op == ReductionOperation::SUM_SQUARE)
- {
- vec_sum_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_sum_value);
- }
- else
+ switch(op)
{
- vec_sum_value = wrapper::vadd(vec_elements, vec_sum_value);
+ case ReductionOperation::SUM_SQUARE:
+ vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
+ break;
+ case ReductionOperation::MEAN_SUM:
+ case ReductionOperation::SUM:
+ vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
+ break;
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
},
input);
- auto carry_addition = wrapper::vpadd(wrapper::vgethigh(vec_sum_value), wrapper::vgetlow(vec_sum_value));
- for(int i = 0; i < S / 4; ++i)
+ switch(op)
{
- carry_addition = wrapper::vpadd(carry_addition, carry_addition);
- }
+ case ReductionOperation::SUM:
+ case ReductionOperation::SUM_SQUARE:
+ case ReductionOperation::MEAN_SUM:
+ {
+ auto carry_res = wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ for(int i = 0; i < S / 4; ++i)
+ {
+ carry_res = wrapper::vpadd(carry_res, carry_res);
+ }
+ auto res = wrapper::vgetlane(carry_res, 0);
- auto res = wrapper::vgetlane(carry_addition, 0);
- if(op == ReductionOperation::MEAN_SUM)
- {
- res /= in_info.dimension(0);
- }
+ if(op == ReductionOperation::MEAN_SUM)
+ {
+ res /= in_info.dimension(0);
+ }
- *(reinterpret_cast<T *>(output.ptr())) = res;
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MIN:
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto res = calculate_vector_index(vec_res_idx, vec_res_value, op);
+ *(reinterpret_cast<uint32_t *>(output.ptr())) = res;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
}
};
-template <ReductionOperation op>
struct RedOpX_qasymm8
{
- inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info)
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, const ReductionOperation op)
{
ARM_COMPUTE_UNUSED(out_slice);
- auto vec_sum_value1 = vdupq_n_u32(static_cast<uint32_t>(0.f));
- auto vec_sum_value2 = vdupq_n_u32(static_cast<uint32_t>(0.f));
- auto vec_sum_value3 = vdupq_n_u32(static_cast<uint32_t>(0.f));
- auto vec_sum_value4 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+ auto vec_res_value1 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+ auto vec_res_value2 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+ auto vec_res_value3 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+ auto vec_res_value4 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+
+ uint8x16_t vec_res_value;
+ if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN)
+ {
+ vec_res_value = wrapper::vdup_n(*input.ptr(), wrapper::traits::vector_128_tag{});
+ }
+ uint32x4x4_t vec_res_idx{ 0 };
execute_window_loop(in_slice, [&](const Coordinates & id)
{
const auto vec_elements = wrapper::vloadq(input.ptr());
-
- const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
- const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
- const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
- const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
- const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
- const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
- vec_sum_value1 = wrapper::vadd(temp32x4t_1, vec_sum_value1);
- vec_sum_value2 = wrapper::vadd(temp32x4t_2, vec_sum_value2);
- vec_sum_value3 = wrapper::vadd(temp32x4t_3, vec_sum_value3);
- vec_sum_value4 = wrapper::vadd(temp32x4t_4, vec_sum_value4);
+ switch(op)
+ {
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ {
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+ vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
+ vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
+ vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
+ vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
},
input);
- auto carry_addition = wrapper::vadd(vec_sum_value1, vec_sum_value2);
- carry_addition = wrapper::vadd(carry_addition, vec_sum_value3);
- carry_addition = wrapper::vadd(carry_addition, vec_sum_value4);
-
- auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_addition), wrapper::vgetlow(carry_addition));
- carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition);
- auto res = wrapper::vgetlane(carry_paddition, 0);
-
- if(op == ReductionOperation::MEAN_SUM)
+ if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
{
- res /= in_info.dimension(0);
+ auto res = calculate_vector_index(vec_res_idx, vec_res_value, op);
+ *(reinterpret_cast<uint32_t *>(output.ptr())) = res;
}
+ else
+ {
+ auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2);
+ carry_res = wrapper::vadd(carry_res, vec_res_value3);
+ carry_res = wrapper::vadd(carry_res, vec_res_value4);
- *(output.ptr()) = static_cast<uint8_t>(res);
+ auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res));
+ carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition);
+ auto res = wrapper::vgetlane(carry_paddition, 0);
+
+ if(op == ReductionOperation::MEAN_SUM)
+ {
+ res /= in_info.dimension(0);
+ }
+
+ *(output.ptr()) = static_cast<uint8_t>(res);
+ }
}
};
-template <typename T, int S, ReductionOperation op>
+template <typename T, int S>
struct RedOpYZW
{
/** NEON vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+ using neon_vector = typename wrapper::traits::neon_vector<T, S>::type;
- inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis)
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis, const ReductionOperation op)
{
ARM_COMPUTE_UNUSED(out_slice);
execute_window_loop(in_slice, [&](const Coordinates & id)
{
- auto vec_sum_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ neon_vector vec_res_value;
+ if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN)
+ {
+ vec_res_value = wrapper::vloadq(reinterpret_cast<T *>(input.ptr()));
+ }
+ else
+ {
+ vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ }
+ uint32x4x4_t vec_res_idx{ 0 };
+
for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
{
T *in_ptr;
@@ -252,41 +557,68 @@ struct RedOpYZW
}
const auto vec_elements = wrapper::vloadq(in_ptr);
- if(op == ReductionOperation::SUM_SQUARE)
+ switch(op)
{
- vec_sum_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_sum_value);
- }
- else
- {
- vec_sum_value = wrapper::vadd(vec_elements, vec_sum_value);
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
+ break;
+ case ReductionOperation::SUM_SQUARE:
+ vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
+ break;
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
}
if(op == ReductionOperation::MEAN_SUM)
{
auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast<T>(in_info.dimension(axis)), ExactTagType{}));
- vec_sum_value = wrapper::vmul(vec_sum_value, vec_width_inv);
+ vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv);
}
- wrapper::vstore(reinterpret_cast<T *>(output.ptr()), vec_sum_value);
+ if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
+ {
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()), vec_res_idx.val[0]);
+ }
+ else
+ {
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr()), vec_res_value);
+ }
},
input, output);
}
};
-template <ReductionOperation op>
struct RedOpYZW_qasymm8
{
- inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis)
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis, const ReductionOperation op)
{
ARM_COMPUTE_UNUSED(out_slice);
execute_window_loop(in_slice, [&](const Coordinates & id)
{
- auto vec_sum_value1 = vdupq_n_u32(static_cast<uint32_t>(0.f));
- auto vec_sum_value2 = vdupq_n_u32(static_cast<uint32_t>(0.f));
- auto vec_sum_value3 = vdupq_n_u32(static_cast<uint32_t>(0.f));
- auto vec_sum_value4 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+ uint32x4x4_t vec_res_idx{ 0 };
+ auto vec_res_value1 = vdupq_n_u32(0);
+ auto vec_res_value2 = vdupq_n_u32(0);
+ auto vec_res_value3 = vdupq_n_u32(0);
+ auto vec_res_value4 = vdupq_n_u32(0);
+ auto vec_res_value = wrapper::vloadq(input.ptr());
+
for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
{
uint8_t *in_ptr;
@@ -306,169 +638,78 @@ struct RedOpYZW_qasymm8
}
const auto vec_elements = wrapper::vloadq(in_ptr);
- const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
- const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
- const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
- const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
- const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
- const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
- vec_sum_value1 = wrapper::vadd(temp32x4t_1, vec_sum_value1);
- vec_sum_value2 = wrapper::vadd(temp32x4t_2, vec_sum_value2);
- vec_sum_value3 = wrapper::vadd(temp32x4t_3, vec_sum_value3);
- vec_sum_value4 = wrapper::vadd(temp32x4t_4, vec_sum_value4);
+ switch(op)
+ {
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ {
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+ vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
+ vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
+ vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
+ vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
}
if(op == ReductionOperation::MEAN_SUM)
{
const auto vec_width_inv = wrapper::vinv(vdupq_n_f32(in_info.dimension(axis)));
- const auto vec_sum_value1_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value1), vec_width_inv);
- const auto vec_sum_value2_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value2), vec_width_inv);
- const auto vec_sum_value3_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value3), vec_width_inv);
- const auto vec_sum_value4_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value4), vec_width_inv);
-
- vec_sum_value1 = vcvtq_u32_f32(vec_sum_value1_f);
- vec_sum_value2 = vcvtq_u32_f32(vec_sum_value2_f);
- vec_sum_value3 = vcvtq_u32_f32(vec_sum_value3_f);
- vec_sum_value4 = vcvtq_u32_f32(vec_sum_value4_f);
- }
-
- const auto temp16x8t_1 = vcombine_u16(wrapper::vqmovn(vec_sum_value1), wrapper::vqmovn(vec_sum_value2));
- const auto temp16x8t_2 = vcombine_u16(wrapper::vqmovn(vec_sum_value3), wrapper::vqmovn(vec_sum_value4));
- auto res = vcombine_u8(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
- wrapper::vstore(output.ptr(), res);
- },
- input, output);
- }
-};
-
-void reduce_sumsq(const Window &window, const ITensor *input, ITensor *output, unsigned int axis)
-{
- switch(axis)
- {
- case 0:
- switch(input->info()->data_type())
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpX<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceX(window, input, output, RedOpX<float16_t, 8, ReductionOperation::SUM_SQUARE>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpX<float, 4, ReductionOperation::SUM_SQUARE>>::reduceX(window, input, output, RedOpX<float, 4, ReductionOperation::SUM_SQUARE>());
- case DataType::QASYMM8:
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- case 1:
- switch(input->info()->data_type())
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceY(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>>::reduceY(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>());
- case DataType::QASYMM8:
- default:
- ARM_COMPUTE_ERROR("Not supported");
+ const auto vec_res_value1_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value1), vec_width_inv);
+ const auto vec_res_value2_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value2), vec_width_inv);
+ const auto vec_res_value3_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value3), vec_width_inv);
+ const auto vec_res_value4_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value4), vec_width_inv);
+
+ vec_res_value1 = vcvtq_u32_f32(vec_res_value1_f);
+ vec_res_value2 = vcvtq_u32_f32(vec_res_value2_f);
+ vec_res_value3 = vcvtq_u32_f32(vec_res_value3_f);
+ vec_res_value4 = vcvtq_u32_f32(vec_res_value4_f);
}
- case 2:
- switch(input->info()->data_type())
+ if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
{
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>>::reduceZ(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>());
- case DataType::QASYMM8:
- default:
- ARM_COMPUTE_ERROR("Not supported");
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()), vec_res_idx.val[0]);
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + 4, vec_res_idx.val[1]);
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + 8, vec_res_idx.val[2]);
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + 12, vec_res_idx.val[3]);
}
- case 3:
- switch(input->info()->data_type())
+ else
{
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceW(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>>::reduceW(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>());
- case DataType::QASYMM8:
- default:
- ARM_COMPUTE_ERROR("Not supported");
+ const auto temp16x8t_1 = vcombine_u16(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
+ const auto temp16x8t_2 = vcombine_u16(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
+ auto res = vcombine_u8(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
+ wrapper::vstore(output.ptr(), res);
}
- default:
- ARM_COMPUTE_ERROR("Unsupported reduction axis");
- }
-}
-void reduce_sum(const Window &window, const ITensor *input, ITensor *output, unsigned int axis)
-{
- switch(axis)
- {
- case 0:
- switch(input->info()->data_type())
- {
- case DataType::QASYMM8:
- return Reducer<RedOpX_qasymm8<ReductionOperation::SUM>>::reduceX(window, input, output, RedOpX_qasymm8<ReductionOperation::SUM>());
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpX<float16_t, 8, ReductionOperation::SUM>>::reduceX(window, input, output, RedOpX<float16_t, 8, ReductionOperation::SUM>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpX<float, 4, ReductionOperation::SUM>>::reduceX(window, input, output, RedOpX<float, 4, ReductionOperation::SUM>());
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- case 1:
- switch(input->info()->data_type())
- {
- case DataType::QASYMM8:
- return Reducer<RedOpYZW_qasymm8<ReductionOperation::SUM>>::reduceY(window, input, output, RedOpYZW_qasymm8<ReductionOperation::SUM>());
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM>>::reduceY(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM>>::reduceY(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM>());
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- case 2:
- switch(input->info()->data_type())
- {
- case DataType::QASYMM8:
- return Reducer<RedOpYZW_qasymm8<ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW_qasymm8<ReductionOperation::SUM>());
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM>());
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- case 3:
- switch(input->info()->data_type())
- {
- case DataType::QASYMM8:
- return Reducer<RedOpYZW_qasymm8<ReductionOperation::SUM>>::reduceW(window, input, output, RedOpYZW_qasymm8<ReductionOperation::SUM>());
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM>>::reduceW(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM>());
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM>>::reduceW(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM>());
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported reduction axis");
+ },
+ input, output);
}
-}
-void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output, unsigned int axis)
+};
+
+void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op)
{
switch(axis)
{
@@ -476,13 +717,13 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output
switch(input->info()->data_type())
{
case DataType::QASYMM8:
- return Reducer<RedOpX_qasymm8<ReductionOperation::MEAN_SUM>>::reduceX(window, input, output, RedOpX_qasymm8<ReductionOperation::MEAN_SUM>());
+ return Reducer<RedOpX_qasymm8>::reduceX(window, input, output, RedOpX_qasymm8(), op);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- return Reducer<RedOpX<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceX(window, input, output, RedOpX<float16_t, 8, ReductionOperation::MEAN_SUM>());
+ return Reducer<RedOpX<float16_t, 8>>::reduceX(window, input, output, RedOpX<float16_t, 8>(), op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
- return Reducer<RedOpX<float, 4, ReductionOperation::MEAN_SUM>>::reduceX(window, input, output, RedOpX<float, 4, ReductionOperation::MEAN_SUM>());
+ return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op);
default:
ARM_COMPUTE_ERROR("Not supported");
}
@@ -490,13 +731,13 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output
switch(input->info()->data_type())
{
case DataType::QASYMM8:
- return Reducer<RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>>::reduceY(window, input, output, RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>());
+ return Reducer<RedOpYZW_qasymm8>::reduceY(window, input, output, RedOpYZW_qasymm8(), op);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceY(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>());
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(), op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>>::reduceY(window, input, output, RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>());
+ return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(), op);
default:
ARM_COMPUTE_ERROR("Not supported");
}
@@ -504,13 +745,13 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output
switch(input->info()->data_type())
{
case DataType::QASYMM8:
- return Reducer<RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>>::reduceZ(window, input, output, RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>());
+ return Reducer<RedOpYZW_qasymm8>::reduceZ(window, input, output, RedOpYZW_qasymm8(), op);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>());
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8>(), op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>>::reduceZ(window, input, output, RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>());
+ return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(), op);
default:
ARM_COMPUTE_ERROR("Not supported");
}
@@ -518,13 +759,13 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output
switch(input->info()->data_type())
{
case DataType::QASYMM8:
- return Reducer<RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>>::reduceW(window, input, output, RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>());
+ return Reducer<RedOpYZW_qasymm8>::reduceW(window, input, output, RedOpYZW_qasymm8(), op);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceW(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>());
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, RedOpYZW<float16_t, 8>(), op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
- return Reducer<RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>>::reduceW(window, input, output, RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>());
+ return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(), op);
default:
ARM_COMPUTE_ERROR("Not supported");
}
@@ -533,14 +774,6 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output
}
}
-TensorShape calculate_output_shape(const TensorShape &input_shape, unsigned int axis)
-{
- TensorShape output_shape{ input_shape };
- output_shape.set(axis, 1);
-
- return output_shape;
-}
-
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
ARM_COMPUTE_UNUSED(op);
@@ -553,10 +786,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
if(output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN);
+ if(!is_arg_min_max)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32);
+ }
+
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- const TensorShape output_shape = calculate_output_shape(input->tensor_shape(), axis);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped);
}
@@ -564,13 +806,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
return Status{};
}
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis)
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
// Calculate output shape and set if empty
- const TensorShape output_shape = calculate_output_shape(input->tensor_shape(), axis);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
// Output auto initialization if not yet initialized
- auto_init_if_empty(*output, output_shape, 1, input->data_type());
+ const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
+ DataType output_data_type = is_arg_min_max ? DataType::U32 : input->data_type();
+ auto_init_if_empty(*output, output_shape, 1, output_data_type);
unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type());
@@ -613,7 +857,7 @@ void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output
_reduction_axis = axis;
// Configure kernel window
- auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis);
+ auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis, op);
ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
@@ -623,7 +867,7 @@ void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output
Status NEReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis)));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis, op)));
return Status{};
}
@@ -634,19 +878,6 @@ void NEReductionOperationKernel::run(const Window &window, const ThreadInfo &inf
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- switch(_op)
- {
- case ReductionOperation::SUM_SQUARE:
- reduce_sumsq(window, _input, _output, _reduction_axis);
- break;
- case ReductionOperation::MEAN_SUM:
- reduce_mean_sum(window, _input, _output, _reduction_axis);
- break;
- case ReductionOperation::SUM:
- reduce_sum(window, _input, _output, _reduction_axis);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported reduction operation.");
- }
+ reduce_op(window, _input, _output, _reduction_axis, _op);
}
} // namespace arm_compute
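
The kernel-side refactor above collapses the per-operation entry points (reduce_sumsq, reduce_sum, reduce_mean_sum) into a single reduce_op that receives the ReductionOperation at run time, which is what lets ARG_IDX_MIN/ARG_IDX_MAX share the existing axis and data-type dispatch. A minimal sketch of the pattern, with simplified stand-ins for the library's Reducer/RedOpX templates (this is not the real kernel code):

// Sketch only: 'Op' and the functor below stand in for the library's
// ReductionOperation and RedOpX templates.
enum class Op { SUM, MEAN_SUM, ARG_IDX_MIN, ARG_IDX_MAX };

struct RedOpXSketch
{
    // The operation is now a runtime argument, so one instantiation
    // covers every reduction instead of one template per Op.
    float operator()(const float *in, int n, Op op) const
    {
        float    acc = 0.f;
        unsigned idx = 0;
        for(int i = 0; i < n; ++i)
        {
            switch(op)
            {
                case Op::SUM:
                case Op::MEAN_SUM:
                    acc += in[i];
                    break;
                case Op::ARG_IDX_MIN:
                    if(in[i] < in[idx]) idx = i;
                    break;
                case Op::ARG_IDX_MAX:
                    if(in[i] > in[idx]) idx = i;
                    break;
            }
        }
        if(op == Op::MEAN_SUM) return acc / n;
        if(op == Op::SUM)      return acc;
        return static_cast<float>(idx); // arg ops yield an index (U32 in the kernel)
    }
};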
diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp
index 0c134c00ed..f2697bcc6d 100644
--- a/src/core/NEON/kernels/NESelectKernel.cpp
+++ b/src/core/NEON/kernels/NESelectKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -14,9 +14,9 @@
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INNEUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY NEAIM, DAMAGES OR OTHER
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
@@ -67,7 +67,7 @@ void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITen
const auto c = (*condition_conversion)(condition_ptr + x);
const auto a = wrapper::vloadq(input1_ptr + x);
const auto b = wrapper::vloadq(input2_ptr + x);
- wrapper::vstore(output_ptr + x, wrapper::vbitselect(c, a, b));
+ wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b));
}
for(; x < window_end_x; ++x)
{
@@ -90,7 +90,7 @@ void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, IT
select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr)
{
static const auto zero = wrapper::vdup_n(static_cast<uint8_t>(0), arm_compute::wrapper::traits::vector_128_tag());
- return wrapper::vgreaterthan(wrapper::vloadq(condition_ptr), zero);
+ return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero);
});
}
@@ -104,7 +104,7 @@ void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, I
select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr)
{
static const auto zero = wrapper::vdup_n(static_cast<uint16_t>(0), arm_compute::wrapper::traits::vector_128_tag());
- return wrapper::vgreaterthan(wrapper::vmovl(wrapper::vload(condition_ptr)), zero);
+ return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero);
});
}
@@ -118,7 +118,7 @@ void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, I
select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr)
{
static const auto zero = wrapper::vdup_n(static_cast<uint32_t>(0), arm_compute::wrapper::traits::vector_128_tag());
- return wrapper::vgreaterthan(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero);
+ return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero);
});
}
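
The NESelectKernel changes above are mechanical renames: the wrapper intrinsics now follow the NEON mnemonics (vbitselect becomes vbsl after VBSL, vgreaterthan becomes vcgt after VCGT). For reference, a sketch of what one vector iteration of select_op_8 boils down to in raw intrinsics (not the library code itself):

#include <arm_neon.h>

// Rough equivalent of one vector iteration of select_op_8; the
// wrapper's vcgt/vbsl map onto vcgtq/vbslq for uint8x16_t.
uint8x16_t select_u8(const uint8_t *cond, const uint8_t *a, const uint8_t *b)
{
    const uint8x16_t zero = vdupq_n_u8(0);
    // Lanes where the condition is non-zero become all-ones masks...
    const uint8x16_t mask = vcgtq_u8(vld1q_u8(cond), zero);
    // ...and vbslq picks 'a' where the mask is set, 'b' elsewhere.
    return vbslq_u8(mask, vld1q_u8(a), vld1q_u8(b));
}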
diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
new file mode 100644
index 0000000000..d33e1342b9
--- /dev/null
+++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reduction_kernel(), _fill_border_kernel(), _run_fill_border(false)
+{
+}
+void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, const ReductionOperation &op)
+{
+ _reduction_kernel.configure(input, output, axis, op);
+
+ if(axis == 0)
+ {
+ _fill_border_kernel.configure(input, _reduction_kernel.border_size(), BorderMode::REPLICATE);
+ _run_fill_border = true;
+ }
+}
+
+Status NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation");
+ ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output, axis, op));
+ return Status{};
+}
+
+void NEArgMinMaxLayer::run()
+{
+ _memory_group.acquire();
+
+ if(_run_fill_border)
+ {
+ NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
+ }
+ NEScheduler::get().schedule(&_reduction_kernel, Window::DimY);
+
+ _memory_group.release();
+}
+
+} // namespace arm_compute
\ No newline at end of file
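
NEArgMinMaxLayer wires the new kernel into a runtime function: it configures the reduction kernel with the requested op and, for axis 0, schedules a border-fill pass first. A minimal usage sketch (shapes and fill are illustrative; assumes the header's usual defaulted memory-manager argument and the standard Tensor allocator flow):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void argmax_example()
{
    Tensor src{}, dst{};
    src.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::F32));

    NEArgMinMaxLayer argmax{};
    // dst is auto-initialised to U32 with dimension 0 reduced to 1.
    argmax.configure(&src, 0 /* axis */, &dst, ReductionOperation::ARG_IDX_MAX);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src, then:
    argmax.run();
}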
diff --git a/tests/validation/NEON/ArgMinMax.cpp b/tests/validation/NEON/ArgMinMax.cpp
new file mode 100644
index 0000000000..611495a41d
--- /dev/null
+++ b/tests/validation/NEON/ArgMinMax.cpp
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "tests/NEON/Accessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/datasets/SplitDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/ArgMinMaxFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(NEON)
+TEST_SUITE(ArgMinMax)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
+ framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 3U, 16U, 2U), 1, DataType::F32), // Invalid axis
+ TensorInfo(TensorShape(27U, 3U, 16U, 2U), 1, DataType::F32), // Invalid output shape
+ TensorInfo(TensorShape(32U, 16U, 16U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 16U, 16U, 2U), 1, DataType::F32) // Invalid operation
+ }),
+ framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(27U, 3U, 1U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(27U, 3U, 1U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 16U, 1U, 2U), 1, DataType::U32),
+ TensorInfo(TensorShape(32U, 16U, 1U, 2U), 1, DataType::F32)
+ })),
+ framework::dataset::make("Axis", { 4, 0, 2, 0 })),
+ framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MAX, ReductionOperation::ARG_IDX_MAX, ReductionOperation::ARG_IDX_MAX, ReductionOperation::MEAN_SUM })),
+ framework::dataset::make("Expected", { false, false, true, false })),
+ input_info, output_info, axis, operation, expected)
+{
+ const Status status = NEArgMinMaxLayer::validate(&input_info.clone()->set_is_resizable(false), axis, &output_info.clone()->set_is_resizable(false), operation);
+ ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+DATA_TEST_CASE(Configuration,
+ framework::DatasetMode::ALL,
+ combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::F16, DataType::F32 })),
+ shape, data_type)
+{
+ // Create tensors
+ Tensor ref_src = create_tensor<Tensor>(shape, data_type);
+ Tensor dst;
+
+ // Create and Configure function
+ NEArgMinMaxLayer arg_min_max_layer;
+ arg_min_max_layer.configure(&ref_src, 1, &dst, ReductionOperation::ARG_IDX_MAX);
+
+ // Validate valid region
+ TensorShape output_shape = shape;
+ output_shape.set(1, 1);
+ const ValidRegion valid_region = shape_to_valid_region(output_shape);
+ validate(dst.info()->valid_region(), valid_region);
+}
+
+template <typename T>
+using NEArgMinMaxValidationFixture = ArgMinMaxValidationFixture<Tensor, Accessor, NEArgMinMaxLayer, T>;
+
+TEST_SUITE(Float)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ NEArgMinMaxValidationFixture<half>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+{
+ // Validate output
+ validate(Accessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge,
+ NEArgMinMaxValidationFixture<half>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+{
+ // Validate output
+ validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // FP16
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+template <typename T>
+using NEArgMinMaxQuantizedValidationFixture = ArgMinMaxValidationQuantizedFixture<Tensor, Accessor, NEArgMinMaxLayer, T>;
+
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ NEArgMinMaxQuantizedValidationFixture<uint8_t>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
+ framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })),
+ framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })))
+{
+ // Validate output
+ validate(Accessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge,
+ NEArgMinMaxQuantizedValidationFixture<uint8_t>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
+ framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })),
+ framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })))
+{
+ // Validate output
+ validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+ NEArgMinMaxValidationFixture<float>,
+ framework::DatasetMode::PRECOMMIT,
+ combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+{
+ // Validate output
+ validate(Accessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge,
+ NEArgMinMaxValidationFixture<float>,
+ framework::DatasetMode::NIGHTLY,
+ combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+{
+ // Validate output
+ validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // Float
+TEST_SUITE_END() // ArgMinMax
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
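
Each zipped row in the Validate case above pairs an input/output info with an axis, operation, and expected status: the first row fails because axis 4 is outside a 4-D tensor, the second because the output shape does not match the reduced input shape, the fourth because MEAN_SUM is not an arg-min/max operation. Distilled to a direct call, the third (passing) row looks roughly like this sketch:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"

using namespace arm_compute;

bool argminmax_is_valid()
{
    // Mirrors the third Validate row above; expected to return true.
    const TensorInfo in(TensorShape(32U, 16U, 16U, 2U), 1, DataType::F32);
    const TensorInfo out(TensorShape(32U, 16U, 1U, 2U), 1, DataType::U32);
    return bool(NEArgMinMaxLayer::validate(&in, 2, &out, ReductionOperation::ARG_IDX_MAX));
}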
diff --git a/tests/validation/fixtures/ArgMinMaxFixture.h b/tests/validation/fixtures/ArgMinMaxFixture.h
index 5f5f85c104..e263b25bf2 100644
--- a/tests/validation/fixtures/ArgMinMaxFixture.h
+++ b/tests/validation/fixtures/ArgMinMaxFixture.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,28 +42,38 @@ namespace test
namespace validation
{
template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class ArgMinMaxValidationFixture : public framework::Fixture
+class ArgMinMaxValidationBaseFixture : public framework::Fixture
{
public:
template <typename...>
- void setup(TensorShape shape, DataType data_type, int axis, ReductionOperation op)
+ void setup(TensorShape shape, DataType data_type, int axis, ReductionOperation op, QuantizationInfo q_info)
{
- _target = compute_target(shape, data_type, axis, op);
- _reference = compute_reference(shape, data_type, axis, op);
+ _target = compute_target(shape, data_type, axis, op, q_info);
+ _reference = compute_reference(shape, data_type, axis, op, q_info);
}
protected:
template <typename U>
void fill(U &&tensor)
{
- std::uniform_real_distribution<> distribution(-1.0f, 1.0f);
- library->fill(tensor, distribution, 0);
+ if(!is_data_type_quantized(tensor.data_type()))
+ {
+ std::uniform_real_distribution<> distribution(-1.0f, 1.0f);
+ library->fill(tensor, distribution, 0);
+ }
+ else
+ {
+ std::pair<int, int> bounds = get_quantized_bounds(tensor.quantization_info(), -1.0f, 1.0f);
+ std::uniform_int_distribution<uint8_t> distribution(bounds.first, bounds.second);
+
+ library->fill(tensor, distribution, 0);
+ }
}
- TensorType compute_target(TensorShape &src_shape, DataType data_type, int axis, ReductionOperation op)
+ TensorType compute_target(TensorShape &src_shape, DataType data_type, int axis, ReductionOperation op, QuantizationInfo q_info)
{
// Create tensors
- TensorType src = create_tensor<TensorType>(src_shape, data_type, 1);
+ TensorType src = create_tensor<TensorType>(src_shape, data_type, 1, q_info);
TensorType dst;
// Create and configure function
@@ -89,21 +99,43 @@ protected:
return dst;
}
- SimpleTensor<T> compute_reference(TensorShape &src_shape, DataType data_type, int axis, ReductionOperation op)
+ SimpleTensor<uint32_t> compute_reference(TensorShape &src_shape, DataType data_type, int axis, ReductionOperation op, QuantizationInfo q_info)
{
// Create reference
- SimpleTensor<T> src{ src_shape, data_type, 1 };
+ SimpleTensor<T> src{ src_shape, data_type, 1, q_info };
// Fill reference
fill(src);
TensorShape output_shape = src_shape;
output_shape.set(axis, 1);
- return reference::reduction_operation<T>(src, output_shape, axis, op);
+ return reference::reduction_operation<T, uint32_t>(src, output_shape, axis, op);
}
- TensorType _target{};
- SimpleTensor<T> _reference{};
+ TensorType _target{};
+ SimpleTensor<uint32_t> _reference{};
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ArgMinMaxValidationQuantizedFixture : public ArgMinMaxValidationBaseFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+ template <typename...>
+ void setup(const TensorShape &shape, DataType data_type, int axis, ReductionOperation op, QuantizationInfo quantization_info)
+ {
+ ArgMinMaxValidationBaseFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, axis, op, quantization_info);
+ }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ArgMinMaxValidationFixture : public ArgMinMaxValidationBaseFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+ template <typename...>
+ void setup(const TensorShape &shape, DataType data_type, int axis, ReductionOperation op)
+ {
+ ArgMinMaxValidationBaseFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, axis, op, QuantizationInfo());
+ }
};
} // namespace validation
} // namespace test
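
For quantized inputs the fixture above maps the real interval [-1, 1] into quantized values via the tensor's QuantizationInfo before drawing uniform integers. A sketch of what get_quantized_bounds computes, assuming the usual QASYMM8 formula q = round(v / scale) + offset clamped to [0, 255]:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Assumed QASYMM8 quantization; not the library's helper itself.
uint8_t quantize_qasymm8(float v, float scale, int offset)
{
    const int q = static_cast<int>(std::lround(v / scale)) + offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

// With the test's QuantizationInfo(5.f / 255.f, 20):
//   quantize_qasymm8(-1.f, 5.f / 255.f, 20) -> max(0, -51 + 20) = 0
//   quantize_qasymm8( 1.f, 5.f / 255.f, 20) -> 51 + 20          = 71
// so the fixture draws uniform integers from [0, 71].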
diff --git a/tests/validation/fixtures/ReduceMeanFixture.h b/tests/validation/fixtures/ReduceMeanFixture.h
index 769d7f674f..44bb9fca6a 100644
--- a/tests/validation/fixtures/ReduceMeanFixture.h
+++ b/tests/validation/fixtures/ReduceMeanFixture.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -113,7 +113,7 @@ protected:
{
TensorShape output_shape = i == 0 ? src_shape : out.shape();
output_shape.set(axis[i], 1);
- out = reference::reduction_operation<T>(i == 0 ? src : out, output_shape, axis[i], ReductionOperation::MEAN_SUM);
+ out = reference::reduction_operation<T, T>(i == 0 ? src : out, output_shape, axis[i], ReductionOperation::MEAN_SUM);
}
if(!keep_dims)
diff --git a/tests/validation/fixtures/ReductionOperationFixture.h b/tests/validation/fixtures/ReductionOperationFixture.h
index 9079b47cbb..d01f41abf0 100644
--- a/tests/validation/fixtures/ReductionOperationFixture.h
+++ b/tests/validation/fixtures/ReductionOperationFixture.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -107,7 +107,7 @@ protected:
// Fill reference
fill(src);
- return reference::reduction_operation<T>(src, dst_shape, axis, op);
+ return reference::reduction_operation<T, T>(src, dst_shape, axis, op);
}
TensorType _target{};
diff --git a/tests/validation/reference/L2NormalizeLayer.cpp b/tests/validation/reference/L2NormalizeLayer.cpp
index fcd6226f07..43885b29e2 100644
--- a/tests/validation/reference/L2NormalizeLayer.cpp
+++ b/tests/validation/reference/L2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -54,7 +54,7 @@ SimpleTensor<T> l2_normalize(const SimpleTensor<T> &src, unsigned int axis, floa
SimpleTensor<T> dst{ src.shape(), src.data_type() };
// Reduce across given axis
- SimpleTensor<T> sum = reduction_operation(src, get_output_shape(src.shape(), axis), axis, ReductionOperation::SUM_SQUARE);
+ SimpleTensor<T> sum = reduction_operation<T, T>(src, get_output_shape(src.shape(), axis), axis, ReductionOperation::SUM_SQUARE);
// Compute reference
const int upper_dims = src.shape().total_size_upper(axis + 1);
diff --git a/tests/validation/reference/ReductionOperation.cpp b/tests/validation/reference/ReductionOperation.cpp
index 37a9be86c0..fc12e31d75 100644
--- a/tests/validation/reference/ReductionOperation.cpp
+++ b/tests/validation/reference/ReductionOperation.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,20 +49,20 @@ OT reduce_operation(const T *ptr, int reduce_elements, ReductionOperation op, in
uint32_t int_res = 0;
for(int i = 0; i < reduce_elements; ++i)
{
- auto elem = static_cast<uint32_t>(*(ptr + stride * i));
+ auto elem = *(ptr + stride * i);
switch(op)
{
case ReductionOperation::ARG_IDX_MIN:
- if(static_cast<uint32_t>(*(ptr + stride * static_cast<uint32_t>(res))) > elem)
+ if(*(ptr + stride * static_cast<uint32_t>(int_res)) > elem)
{
- res = static_cast<uint32_t>(i);
+ int_res = static_cast<uint32_t>(i);
}
break;
case ReductionOperation::ARG_IDX_MAX:
- if(static_cast<uint32_t>(*(ptr + stride * static_cast<uint32_t>(res))) < elem)
+ if(*(ptr + stride * static_cast<uint32_t>(int_res)) < elem)
{
- res = static_cast<uint32_t>(i);
+ int_res = static_cast<uint32_t>(i);
}
break;
case ReductionOperation::SUM_SQUARE:
@@ -122,13 +122,13 @@ OT reduce_operation(const T *ptr, int reduce_elements, ReductionOperation op, in
}
} // namespace
-template <typename T>
-SimpleTensor<T> reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op)
+template <typename T, typename OT>
+SimpleTensor<OT> reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op)
{
// Create reference
const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
DataType output_data_type = is_arg_min_max ? DataType::U32 : src.data_type();
- SimpleTensor<T> dst{ dst_shape, output_data_type, 1, src.quantization_info() };
+ SimpleTensor<OT> dst{ dst_shape, output_data_type, 1, src.quantization_info() };
const unsigned int src_width = src.shape().x();
const unsigned int src_height = src.shape().y();
const unsigned int src_depth = src.shape().z();
@@ -143,14 +143,7 @@ SimpleTensor<T> reduction_operation(const SimpleTensor<T> &src, const TensorShap
for(unsigned int du = 0; du < upper_dims; ++du)
{
const T *src_row_ptr = src.data() + du * reduce_elems;
- if(is_arg_min_max)
- {
- dst[du] = reduce_operation<T, uint32_t>(src_row_ptr, reduce_elems, op, 1);
- }
- else
- {
- dst[du] = reduce_operation<T, T>(src_row_ptr, reduce_elems, op, 1);
- }
+ dst[du] = reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, 1);
}
}
break;
@@ -164,15 +157,7 @@ SimpleTensor<T> reduction_operation(const SimpleTensor<T> &src, const TensorShap
const int in_offset = du * src_height * src_width + x;
const int out_offset = du * src_width + x;
const T *src_row_ptr = src.data() + in_offset;
-
- if(is_arg_min_max)
- {
- dst[out_offset] = reduce_operation<T, uint32_t>(src_row_ptr, reduce_elems, op, src_width);
- }
- else
- {
- dst[out_offset] = reduce_operation<T, T>(src_row_ptr, reduce_elems, op, src_width);
- }
+ dst[out_offset] = reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width);
}
}
}
@@ -189,15 +174,7 @@ SimpleTensor<T> reduction_operation(const SimpleTensor<T> &src, const TensorShap
const int in_offset = du * src_depth * src_height * src_width + y * src_width + x;
const int out_offset = du * src_width * src_height + y * src_width + x;
const T *src_row_ptr = src.data() + in_offset;
-
- if(is_arg_min_max)
- {
- dst[out_offset] = reduce_operation<T, uint32_t>(src_row_ptr, reduce_elems, op, src_height * src_width);
- }
- else
- {
- dst[out_offset] = reduce_operation<T, T>(src_row_ptr, reduce_elems, op, src_height * src_width);
- }
+ dst[out_offset] = reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_height * src_width);
}
}
}
@@ -217,14 +194,7 @@ SimpleTensor<T> reduction_operation(const SimpleTensor<T> &src, const TensorShap
const int in_offset = du * src_batch * src_depth * src_height * src_width + z * src_width * src_height + y * src_width + x;
const int out_offset = du * src_depth * src_height * src_width + z * src_width * src_height + y * src_width + x;
const T *src_row_ptr = src.data() + in_offset;
- if(is_arg_min_max)
- {
- dst[out_offset] = reduce_operation<T, uint32_t>(src_row_ptr, reduce_elems, op, src_width * src_height * src_depth);
- }
- else
- {
- dst[out_offset] = reduce_operation<T, T>(src_row_ptr, reduce_elems, op, src_width * src_height * src_depth);
- }
+ dst[out_offset] = reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height * src_depth);
}
}
}
@@ -238,6 +208,9 @@ SimpleTensor<T> reduction_operation(const SimpleTensor<T> &src, const TensorShap
return dst;
}
+template SimpleTensor<uint32_t> reduction_operation(const SimpleTensor<float> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
+template SimpleTensor<uint32_t> reduction_operation(const SimpleTensor<half> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
+template SimpleTensor<uint32_t> reduction_operation(const SimpleTensor<uint8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
template SimpleTensor<float> reduction_operation(const SimpleTensor<float> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
template SimpleTensor<half> reduction_operation(const SimpleTensor<half> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
template SimpleTensor<uint8_t> reduction_operation(const SimpleTensor<uint8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
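
With the output type now a template parameter, the arg-min/max instantiations return SimpleTensor<uint32_t> while the numeric reductions keep SimpleTensor<T>, removing the per-branch is_arg_min_max dispatch. The core of the index-tracking loop above, condensed into a sketch of the reference logic (not the test code itself):

#include <cstdint>

// Walk a strided row once, keeping the index of the largest element.
// ARG_IDX_MIN is identical with the comparison flipped.
template <typename T>
uint32_t arg_idx_max(const T *ptr, int n, int stride)
{
    uint32_t best = 0;
    for(int i = 1; i < n; ++i)
    {
        if(ptr[stride * i] > ptr[stride * best])
        {
            best = static_cast<uint32_t>(i);
        }
    }
    return best;
}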
diff --git a/tests/validation/reference/ReductionOperation.h b/tests/validation/reference/ReductionOperation.h
index 859b57aa7b..9f7050f551 100644
--- a/tests/validation/reference/ReductionOperation.h
+++ b/tests/validation/reference/ReductionOperation.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,10 +35,10 @@ namespace validation
{
namespace reference
{
-template <typename T>
-SimpleTensor<T> reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
+template <typename T, typename OT>
+SimpleTensor<OT> reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
} // namespace reference
} // namespace validation
} // namespace test
} // namespace arm_compute
-#endif /* __ARM_COMPUTE_TEST_FLOOR_H__ */
+#endif /* __ARM_COMPUTE_TEST_REDUCTION_OPERATION_H__ */