From aea14c63e2efeda9d5f7492099389d439c65204f Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Thu, 3 Jan 2019 11:10:25 +0000 Subject: COMPMID-1764 NEON: Implement ArgMax/ArgMin Change-Id: Ibe23aa90b36ffd8553d1d1c35fada5d300fab829 Reviewed-on: https://review.mlplatform.org/475 Reviewed-by: Isabella Gottardi Tested-by: Arm Jenkins Reviewed-by: Giuseppe Rossini --- .../core/NEON/wrapper/intrinsics/bitselect.h | 64 -- arm_compute/core/NEON/wrapper/intrinsics/bsl.h | 64 ++ arm_compute/core/NEON/wrapper/intrinsics/ceq.h | 64 ++ arm_compute/core/NEON/wrapper/intrinsics/cgt.h | 64 ++ arm_compute/core/NEON/wrapper/intrinsics/clt.h | 64 ++ .../core/NEON/wrapper/intrinsics/greaterthan.h | 64 -- .../core/NEON/wrapper/intrinsics/intrinsics.h | 9 +- arm_compute/core/NEON/wrapper/intrinsics/orr.h | 60 ++ arm_compute/core/NEON/wrapper/intrinsics/pmax.h | 53 ++ arm_compute/core/NEON/wrapper/intrinsics/pmin.h | 53 ++ arm_compute/core/utils/misc/ShapeCalculator.h | 15 + arm_compute/runtime/NEON/NEFunctions.h | 1 + .../runtime/NEON/functions/NEArgMinMaxLayer.h | 78 +++ .../NEON/kernels/NEReductionOperationKernel.cpp | 765 ++++++++++++++------- src/core/NEON/kernels/NESelectKernel.cpp | 14 +- src/runtime/NEON/functions/NEArgMinMaxLayer.cpp | 71 ++ tests/validation/NEON/ArgMinMax.cpp | 167 +++++ tests/validation/fixtures/ArgMinMaxFixture.h | 60 +- tests/validation/fixtures/ReduceMeanFixture.h | 4 +- .../fixtures/ReductionOperationFixture.h | 4 +- tests/validation/reference/L2NormalizeLayer.cpp | 4 +- tests/validation/reference/ReductionOperation.cpp | 59 +- tests/validation/reference/ReductionOperation.h | 8 +- 23 files changed, 1338 insertions(+), 471 deletions(-) delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/bitselect.h create mode 100644 arm_compute/core/NEON/wrapper/intrinsics/bsl.h create mode 100644 arm_compute/core/NEON/wrapper/intrinsics/ceq.h create mode 100644 arm_compute/core/NEON/wrapper/intrinsics/cgt.h create mode 100644 arm_compute/core/NEON/wrapper/intrinsics/clt.h delete mode 100644 arm_compute/core/NEON/wrapper/intrinsics/greaterthan.h create mode 100644 arm_compute/core/NEON/wrapper/intrinsics/orr.h create mode 100644 arm_compute/core/NEON/wrapper/intrinsics/pmax.h create mode 100644 arm_compute/core/NEON/wrapper/intrinsics/pmin.h create mode 100644 arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h create mode 100644 src/runtime/NEON/functions/NEArgMinMaxLayer.cpp create mode 100644 tests/validation/NEON/ArgMinMax.cpp diff --git a/arm_compute/core/NEON/wrapper/intrinsics/bitselect.h b/arm_compute/core/NEON/wrapper/intrinsics/bitselect.h deleted file mode 100644 index 8223f6d463..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/bitselect.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT SELECT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_WRAPPER_BITSELECT_H__ -#define __ARM_COMPUTE_WRAPPER_BITSELECT_H__ - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VBITSELECT_IMPL(stype, vtype, ctype, prefix, postfix) \ - inline vtype vbitselect(const ctype &a, const vtype &b, const vtype &c) \ - { \ - return prefix##_##postfix(a, b, c); \ - } - -VBITSELECT_IMPL(uint8_t, uint8x8_t, uint8x8_t, vbsl, u8) -VBITSELECT_IMPL(int8_t, int8x8_t, uint8x8_t, vbsl, s8) -VBITSELECT_IMPL(uint16_t, uint16x4_t, uint16x4_t, vbsl, u16) -VBITSELECT_IMPL(int16_t, int16x4_t, uint16x4_t, vbsl, s16) -VBITSELECT_IMPL(uint32_t, uint32x2_t, uint32x2_t, vbsl, u32) -VBITSELECT_IMPL(int32_t, int32x2_t, uint32x2_t, vbsl, s32) -VBITSELECT_IMPL(float32x2_t, float32x2_t, uint32x2_t, vbsl, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VBITSELECT_IMPL(float16x4_t, float16x4_t, uint16x4_t, vbsl, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VBITSELECT_IMPL(uint8_t, uint8x16_t, uint8x16_t, vbslq, u8) -VBITSELECT_IMPL(int8_t, int8x16_t, uint8x16_t, vbslq, s8) -VBITSELECT_IMPL(uint16_t, uint16x8_t, uint16x8_t, vbslq, u16) -VBITSELECT_IMPL(int16_t, int16x8_t, uint16x8_t, vbslq, s16) -VBITSELECT_IMPL(uint32_t, uint32x4_t, uint32x4_t, vbslq, u32) -VBITSELECT_IMPL(int32_t, int32x4_t, uint32x4_t, vbslq, s32) -VBITSELECT_IMPL(float32x4_t, float32x4_t, uint32x4_t, vbslq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VBITSELECT_IMPL(float16x8_t, float16x8_t, uint16x8_t, vbslq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VBITSELECT_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* __ARM_COMPUTE_WRAPPER_BITSELECT_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/bsl.h b/arm_compute/core/NEON/wrapper/intrinsics/bsl.h new file mode 100644 index 0000000000..9831b4b842 --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/bsl.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_WRAPPER_BSL_H__ +#define __ARM_COMPUTE_WRAPPER_BSL_H__ + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VBSL_IMPL(vctype, vtype, prefix, postfix) \ + inline vtype vbsl(const vctype &a, const vtype &b, const vtype &c) \ + { \ + return prefix##_##postfix(a, b, c); \ + } + +VBSL_IMPL(uint8x8_t, uint8x8_t, vbsl, u8) +VBSL_IMPL(uint8x8_t, int8x8_t, vbsl, s8) +VBSL_IMPL(uint16x4_t, uint16x4_t, vbsl, u16) +VBSL_IMPL(uint16x4_t, int16x4_t, vbsl, s16) +VBSL_IMPL(uint32x2_t, uint32x2_t, vbsl, u32) +VBSL_IMPL(uint32x2_t, int32x2_t, vbsl, s32) +VBSL_IMPL(uint32x2_t, float32x2_t, vbsl, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VBSL_IMPL(uint16x4_t, float16x4_t, vbsl, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +VBSL_IMPL(uint8x16_t, uint8x16_t, vbslq, u8) +VBSL_IMPL(uint8x16_t, int8x16_t, vbslq, s8) +VBSL_IMPL(uint16x8_t, uint16x8_t, vbslq, u16) +VBSL_IMPL(uint16x8_t, int16x8_t, vbslq, s16) +VBSL_IMPL(uint32x4_t, uint32x4_t, vbslq, u32) +VBSL_IMPL(uint32x4_t, int32x4_t, vbslq, s32) +VBSL_IMPL(uint32x4_t, float32x4_t, vbslq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VBSL_IMPL(uint16x8_t, float16x8_t, vbslq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VBSL_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_BSL_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/ceq.h b/arm_compute/core/NEON/wrapper/intrinsics/ceq.h new file mode 100644 index 0000000000..812ac326a8 --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/ceq.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_WRAPPER_CEQ_H__ +#define __ARM_COMPUTE_WRAPPER_CEQ_H__ + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VCEQ_IMPL(votype, vtype, prefix, postfix) \ + inline votype vceq(const vtype &a, const vtype &b) \ + { \ + return prefix##_##postfix(a, b); \ + } + +VCEQ_IMPL(uint8x8_t, uint8x8_t, vceq, u8) +VCEQ_IMPL(uint8x8_t, int8x8_t, vceq, s8) +VCEQ_IMPL(uint16x4_t, uint16x4_t, vceq, u16) +VCEQ_IMPL(uint16x4_t, int16x4_t, vceq, s16) +VCEQ_IMPL(uint32x2_t, uint32x2_t, vceq, u32) +VCEQ_IMPL(uint32x2_t, int32x2_t, vceq, s32) +VCEQ_IMPL(uint32x2_t, float32x2_t, vceq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VCEQ_IMPL(uint16x4_t, float16x4_t, vceq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +VCEQ_IMPL(uint8x16_t, uint8x16_t, vceqq, u8) +VCEQ_IMPL(uint8x16_t, int8x16_t, vceqq, s8) +VCEQ_IMPL(uint16x8_t, uint16x8_t, vceqq, u16) +VCEQ_IMPL(uint16x8_t, int16x8_t, vceqq, s16) +VCEQ_IMPL(uint32x4_t, uint32x4_t, vceqq, u32) +VCEQ_IMPL(uint32x4_t, int32x4_t, vceqq, s32) +VCEQ_IMPL(uint32x4_t, float32x4_t, vceqq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VCEQ_IMPL(uint16x8_t, float16x8_t, vceqq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VCEQ_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_CEQ_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/cgt.h b/arm_compute/core/NEON/wrapper/intrinsics/cgt.h new file mode 100644 index 0000000000..c2ed9df1dc --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/cgt.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_WRAPPER_CGT_H__ +#define __ARM_COMPUTE_WRAPPER_CGT_H__ + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VCGT_IMPL(votype, vtype, prefix, postfix) \ + inline votype vcgt(const vtype &a, const vtype &b) \ + { \ + return prefix##_##postfix(a, b); \ + } + +VCGT_IMPL(uint8x8_t, uint8x8_t, vcgt, u8) +VCGT_IMPL(uint8x8_t, int8x8_t, vcgt, s8) +VCGT_IMPL(uint16x4_t, uint16x4_t, vcgt, u16) +VCGT_IMPL(uint16x4_t, int16x4_t, vcgt, s16) +VCGT_IMPL(uint32x2_t, uint32x2_t, vcgt, u32) +VCGT_IMPL(uint32x2_t, int32x2_t, vcgt, s32) +VCGT_IMPL(uint32x2_t, float32x2_t, vcgt, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VCGT_IMPL(uint16x4_t, float16x4_t, vcgt, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +VCGT_IMPL(uint8x16_t, uint8x16_t, vcgtq, u8) +VCGT_IMPL(uint8x16_t, int8x16_t, vcgtq, s8) +VCGT_IMPL(uint16x8_t, uint16x8_t, vcgtq, u16) +VCGT_IMPL(uint16x8_t, int16x8_t, vcgtq, s16) +VCGT_IMPL(uint32x4_t, uint32x4_t, vcgtq, u32) +VCGT_IMPL(uint32x4_t, int32x4_t, vcgtq, s32) +VCGT_IMPL(uint32x4_t, float32x4_t, vcgtq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VCGT_IMPL(uint16x8_t, float16x8_t, vcgtq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VCGT_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_CGT_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/clt.h b/arm_compute/core/NEON/wrapper/intrinsics/clt.h new file mode 100644 index 0000000000..a187c216d7 --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/clt.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_WRAPPER_CLT_H__ +#define __ARM_COMPUTE_WRAPPER_CLT_H__ + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VCLT_IMPL(votype, vtype, prefix, postfix) \ + inline votype vclt(const vtype &a, const vtype &b) \ + { \ + return prefix##_##postfix(a, b); \ + } + +VCLT_IMPL(uint8x8_t, uint8x8_t, vclt, u8) +VCLT_IMPL(uint8x8_t, int8x8_t, vclt, s8) +VCLT_IMPL(uint16x4_t, uint16x4_t, vclt, u16) +VCLT_IMPL(uint16x4_t, int16x4_t, vclt, s16) +VCLT_IMPL(uint32x2_t, uint32x2_t, vclt, u32) +VCLT_IMPL(uint32x2_t, int32x2_t, vclt, s32) +VCLT_IMPL(uint32x2_t, float32x2_t, vclt, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VCLT_IMPL(uint16x4_t, float16x4_t, vclt, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +VCLT_IMPL(uint8x16_t, uint8x16_t, vcltq, u8) +VCLT_IMPL(uint8x16_t, int8x16_t, vcltq, s8) +VCLT_IMPL(uint16x8_t, uint16x8_t, vcltq, u16) +VCLT_IMPL(uint16x8_t, int16x8_t, vcltq, s16) +VCLT_IMPL(uint32x4_t, uint32x4_t, vcltq, u32) +VCLT_IMPL(uint32x4_t, int32x4_t, vcltq, s32) +VCLT_IMPL(uint32x4_t, float32x4_t, vcltq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VCLT_IMPL(uint16x8_t, float16x8_t, vcltq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VCLT_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_CLT_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/greaterthan.h b/arm_compute/core/NEON/wrapper/intrinsics/greaterthan.h deleted file mode 100644 index 5ee7516a4e..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/greaterthan.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef __ARM_COMPUTE_WRAPPER_CGT_H__ -#define __ARM_COMPUTE_WRAPPER_CGT_H__ - -#include - -namespace arm_compute -{ -namespace wrapper -{ -#define VCGT_IMPL(stype, vtype, rtype, prefix, postfix) \ - inline rtype vgreaterthan(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ - } - -VCGT_IMPL(uint8_t, uint8x8_t, uint8x8_t, vcgt, u8) -VCGT_IMPL(int8_t, int8x8_t, uint8x8_t, vcgt, s8) -VCGT_IMPL(uint16_t, uint16x4_t, uint16x4_t, vcgt, u16) -VCGT_IMPL(int16_t, int16x4_t, uint16x4_t, vcgt, s16) -VCGT_IMPL(uint32_t, uint32x2_t, uint32x2_t, vcgt, u32) -VCGT_IMPL(int32_t, int32x2_t, uint32x2_t, vcgt, s32) -VCGT_IMPL(float32x2_t, float32x2_t, uint32x2_t, vcgt, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VCGT_IMPL(float16x4_t, float16x4_t, uint16x4_t, vcgt, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -VCGT_IMPL(uint8_t, uint8x16_t, uint8x16_t, vcgtq, u8) -VCGT_IMPL(int8_t, int8x16_t, uint8x16_t, vcgtq, s8) -VCGT_IMPL(uint16_t, uint16x8_t, uint16x8_t, vcgtq, u16) -VCGT_IMPL(int16_t, int16x8_t, uint16x8_t, vcgtq, s16) -VCGT_IMPL(uint32_t, uint32x4_t, uint32x4_t, vcgtq, u32) -VCGT_IMPL(int32_t, int32x4_t, uint32x4_t, vcgtq, s32) -VCGT_IMPL(float32x4_t, float32x4_t, uint32x4_t, vcgtq, f32) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VCGT_IMPL(float16x8_t, float16x8_t, uint16x8_t, vcgtq, f16) -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -#undef VCGT_IMPL -} // namespace wrapper -} // namespace arm_compute -#endif /* __ARM_COMPUTE_WRAPPER_CGT_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h index d00d3303f1..97af983e62 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h +++ b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h @@ -26,14 +26,16 @@ #include "arm_compute/core/NEON/wrapper/intrinsics/add.h" #include "arm_compute/core/NEON/wrapper/intrinsics/and.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/bitselect.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/bsl.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/ceq.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/cgt.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/clt.h" #include "arm_compute/core/NEON/wrapper/intrinsics/combine.h" #include "arm_compute/core/NEON/wrapper/intrinsics/dup_n.h" #include "arm_compute/core/NEON/wrapper/intrinsics/exp.h" #include "arm_compute/core/NEON/wrapper/intrinsics/gethigh.h" #include "arm_compute/core/NEON/wrapper/intrinsics/getlane.h" #include "arm_compute/core/NEON/wrapper/intrinsics/getlow.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/greaterthan.h" #include "arm_compute/core/NEON/wrapper/intrinsics/inv.h" #include "arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h" #include "arm_compute/core/NEON/wrapper/intrinsics/load.h" @@ -44,7 +46,10 @@ #include "arm_compute/core/NEON/wrapper/intrinsics/movn.h" #include "arm_compute/core/NEON/wrapper/intrinsics/mul.h" #include "arm_compute/core/NEON/wrapper/intrinsics/neg.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/orr.h" #include "arm_compute/core/NEON/wrapper/intrinsics/padd.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/pmax.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/pmin.h" #include "arm_compute/core/NEON/wrapper/intrinsics/pow.h" #include "arm_compute/core/NEON/wrapper/intrinsics/rev64.h" #include "arm_compute/core/NEON/wrapper/intrinsics/store.h" diff --git a/arm_compute/core/NEON/wrapper/intrinsics/orr.h 
b/arm_compute/core/NEON/wrapper/intrinsics/orr.h new file mode 100644 index 0000000000..d82dc56a6d --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/orr.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_WRAPPER_ORR_H__ +#define __ARM_COMPUTE_WRAPPER_ORR_H__ + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VORR_IMPL(stype, vtype, prefix, postfix) \ + inline vtype vorr(const vtype &a, const vtype &b) \ + { \ + return prefix##_##postfix(a, b); \ + } + +VORR_IMPL(uint8_t, uint8x8_t, vorr, u8) +VORR_IMPL(int8_t, int8x8_t, vorr, s8) +VORR_IMPL(uint16_t, uint16x4_t, vorr, u16) +VORR_IMPL(int16_t, int16x4_t, vorr, s16) +VORR_IMPL(uint32_t, uint32x2_t, vorr, u32) +VORR_IMPL(int32_t, int32x2_t, vorr, s32) +VORR_IMPL(uint64_t, uint64x1_t, vorr, u64) +VORR_IMPL(int64_t, int64x1_t, vorr, s64) + +VORR_IMPL(uint8_t, uint8x16_t, vorrq, u8) +VORR_IMPL(int8_t, int8x16_t, vorrq, s8) +VORR_IMPL(uint16_t, uint16x8_t, vorrq, u16) +VORR_IMPL(int16_t, int16x8_t, vorrq, s16) +VORR_IMPL(uint32_t, uint32x4_t, vorrq, u32) +VORR_IMPL(int32_t, int32x4_t, vorrq, s32) +VORR_IMPL(uint64_t, uint64x2_t, vorrq, u64) +VORR_IMPL(int64_t, int64x2_t, vorrq, s64) + +#undef VORR_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_ORR_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/pmax.h b/arm_compute/core/NEON/wrapper/intrinsics/pmax.h new file mode 100644 index 0000000000..7f701f89c4 --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/pmax.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_WRAPPER_PMAX_H__ +#define __ARM_COMPUTE_WRAPPER_PMAX_H__ + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VPMAX_IMPL(stype, vtype, prefix, postfix) \ + inline vtype vpmax(const vtype &a, const vtype &b) \ + { \ + return prefix##_##postfix(a, b); \ + } + +VPMAX_IMPL(uint8_t, uint8x8_t, vpmax, u8) +VPMAX_IMPL(int8_t, int8x8_t, vpmax, s8) +VPMAX_IMPL(uint16_t, uint16x4_t, vpmax, u16) +VPMAX_IMPL(int16_t, int16x4_t, vpmax, s16) +VPMAX_IMPL(uint32_t, uint32x2_t, vpmax, u32) +VPMAX_IMPL(int32_t, int32x2_t, vpmax, s32) +VPMAX_IMPL(float, float32x2_t, vpmax, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VPMAX_IMPL(float16_t, float16x4_t, vpmax, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VPMAX_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_PMAX_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/pmin.h b/arm_compute/core/NEON/wrapper/intrinsics/pmin.h new file mode 100644 index 0000000000..52d5eb17a0 --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/pmin.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_WRAPPER_PMIN_H__ +#define __ARM_COMPUTE_WRAPPER_PMIN_H__ + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VPMIN_IMPL(stype, vtype, prefix, postfix) \ + inline vtype vpmin(const vtype &a, const vtype &b) \ + { \ + return prefix##_##postfix(a, b); \ + } + +VPMIN_IMPL(uint8_t, uint8x8_t, vpmin, u8) +VPMIN_IMPL(int8_t, int8x8_t, vpmin, s8) +VPMIN_IMPL(uint16_t, uint16x4_t, vpmin, u16) +VPMIN_IMPL(int16_t, int16x4_t, vpmin, s16) +VPMIN_IMPL(uint32_t, uint32x2_t, vpmin, u32) +VPMIN_IMPL(int32_t, int32x2_t, vpmin, s32) +VPMIN_IMPL(float, float32x2_t, vpmin, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VPMIN_IMPL(float16_t, float16x4_t, vpmin, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VPMIN_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_PMIN_H__ */ diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h index 70727424b3..619234d306 100644 --- a/arm_compute/core/utils/misc/ShapeCalculator.h +++ b/arm_compute/core/utils/misc/ShapeCalculator.h @@ -1033,6 +1033,21 @@ inline TensorShape compute_tiled_shape(const TensorShape &input_shape, const Mul return tiled_shape; } +/** Calculate the reduced shape of a tensor given an axis + * + * @param[in] input Input tensor info + * @param[in] axis Axis on which to perform reduction + * + * @return the calculated shape + */ +inline TensorShape compute_reduced_shape(const TensorShape &input, unsigned int axis) +{ + TensorShape output_shape{ input }; + output_shape.set(axis, 1); + + return output_shape; +} + /** Calculate the upsampled shape of a tensor * * @param[in] input Input tensor info diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h index 2e94030e53..2daef70cef 100644 --- a/arm_compute/runtime/NEON/NEFunctions.h +++ b/arm_compute/runtime/NEON/NEFunctions.h @@ -28,6 +28,7 @@ #include "arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h" #include "arm_compute/runtime/NEON/functions/NEAccumulate.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h" diff --git a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h new file mode 100644 index 0000000000..87d77a5e13 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEARGMINMAXLAYER_H__ +#define __ARM_COMPUTE_NEARGMINMAXLAYER_H__ + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/NEON/kernels/NEReductionOperationKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +namespace arm_compute +{ +class IsTensor; + +/** Function to calculate the index of the minimum or maximum values in a tensor based on an axis. + * This function calls the following NEON kernels: + * + * -# @ref NEReductionOperationKernel + * -# @ref NEFillBorderKernel + * + */ +class NEArgMinMaxLayer : public IFunction +{ +public: + /** Constructor */ + NEArgMinMaxLayer(std::shared_ptr memory_manager = nullptr); + /** Set the input and output tensors. + * + * @param[in] input Input source tensor. Data types supported: F16/F32. + * @param[in] axis Axis to find max/min index. + * @param[out] output Output source tensor. Data types supported: U32. + * @param[in] op Operation to perform: min or max + */ + void configure(ITensor *input, int axis, ITensor *output, const ReductionOperation &op); + /** Static function to check if given info will lead to a valid configuration of @ref NEArgMinMaxLayer + * + * @param[in] input Input source tensor info. Data types supported: F16/F32. + * @param[in] axis Axis to find max/min index. + * @param[in] output Output source tensor info. Data types supported: U32. + * @param[in] op Operation to perform: min or max + * + * @return a status + */ + static Status validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + NEReductionOperationKernel _reduction_kernel; + NEFillBorderKernel _fill_border_kernel; + bool _run_fill_border; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEARGMINMAXLAYER_H__ */ diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp index 9306e0303d..64e3cfe404 100644 --- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp +++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -31,6 +31,7 @@ #include "arm_compute/core/NEON/NEMath.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/NEON/wrapper/wrapper.h" #include @@ -39,11 +40,225 @@ namespace arm_compute { namespace { +uint32x4x4_t calculate_index(uint32_t idx, float32x4_t a, float32x4_t b, uint32x4x4_t c, ReductionOperation op, int axis) +{ + uint32x4_t mask{ 0 }; + if(op == ReductionOperation::ARG_IDX_MIN) + { + mask = wrapper::vcgt(b, a); + } + else + { + mask = wrapper::vclt(b, a); + } + + uint32x4_t vec_idx = { idx, idx + 1, idx + 2, idx + 3 }; + if(axis != 0) + { + vec_idx = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + } + uint32x4x4_t res = { wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0 }; + + return res; +} + +uint32x4x4_t calculate_index(uint32_t idx, uint8x16_t a, uint8x16_t b, uint32x4x4_t c, ReductionOperation op, int axis) +{ + uint32x4x4_t mask{ 0 }; + uint8x16_t mask_u8{ 0 }; + if(op == ReductionOperation::ARG_IDX_MIN) + { + mask_u8 = wrapper::vcgt(b, a); + } + else + { + mask_u8 = wrapper::vclt(b, a); + } + mask.val[0] = wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(mask_u8)))); + mask.val[1] = wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(mask_u8)))); + mask.val[2] = wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(mask_u8)))); + mask.val[3] = wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(mask_u8)))); + uint32x4x4_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 }, + { idx + 4, idx + 5, idx + 6, idx + 7 }, + { idx + 8, idx + 9, idx + 10, idx + 11 }, + { idx + 12, idx + 13, idx + 14, idx + 15 } + } + }; + if(axis != 0) + { + vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + vec_idx.val[2] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + vec_idx.val[3] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + } + uint32x4x4_t res = { vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]), + vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]), + vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]), + vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3]) + }; + + return res; +} + +uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float32x4_t vec_res_value, ReductionOperation op) +{ + uint32x4_t res_idx_mask{ 0 }; + uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); + + if(op == ReductionOperation::ARG_IDX_MIN) + { + auto pmin = wrapper::vpmin(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + pmin = wrapper::vpmin(pmin, pmin); + auto mask = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); + res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask); + } + else + { + auto pmax = wrapper::vpmax(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + pmax = wrapper::vpmax(pmax, pmax); + auto mask = vceqq_f32(vec_res_value, wrapper::vcombine(pmax, pmax)); + res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask); + } + + res_idx_mask = wrapper::vadd(res_idx_mask, mask_ones); + auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask), wrapper::vgetlow(res_idx_mask)); + pmin = wrapper::vpmin(pmin, pmin); + uint32_t res = wrapper::vgetlane(pmin, 0); + + return (res - 0xFFFFFFFF); +} + +uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, uint8x16_t vec_res_value, ReductionOperation op) +{ + uint32x4x4_t 
res_idx_mask{ 0 }; + uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); + uint8x16_t mask_u8{ 0 }; + if(op == ReductionOperation::ARG_IDX_MIN) + { + auto pmin = wrapper::vpmin(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + pmin = wrapper::vpmin(pmin, pmin); + pmin = wrapper::vpmin(pmin, pmin); + pmin = wrapper::vpmin(pmin, pmin); + mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); + } + else + { + auto pmax = wrapper::vpmax(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + pmax = wrapper::vpmax(pmax, pmax); + pmax = wrapper::vpmax(pmax, pmax); + pmax = wrapper::vpmax(pmax, pmax); + mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax)); + } + + // Widen vectors + auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8))); + auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8))); + auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1))); + auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1))); + auto wide_u32_3 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2))); + auto wide_u32_4 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2))); + res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1); + res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2); + res_idx_mask.val[2] = wrapper::vand(vec_res_idx.val[2], wide_u32_3); + res_idx_mask.val[3] = wrapper::vand(vec_res_idx.val[3], wide_u32_4); + res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones); + res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones); + res_idx_mask.val[2] = wrapper::vadd(res_idx_mask.val[2], mask_ones); + res_idx_mask.val[3] = wrapper::vadd(res_idx_mask.val[3], mask_ones); + + uint32_t res = 0xFFFFFFFF; + int iter = 0; + do + { + auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter])); + pmin = wrapper::vpmin(pmin, pmin); + res = std::min(wrapper::vgetlane(pmin, 0), res); + iter++; + } + while(iter < 4); + + return (res - 0xFFFFFFFF); +} +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis) +{ + uint32x4x2_t mask{ 0 }; + uint16x8_t mask_u16{ 0 }; + if(op == ReductionOperation::ARG_IDX_MIN) + { + mask_u16 = wrapper::vcgt(b, a); + } + else + { + mask_u16 = wrapper::vclt(b, a); + } + mask.val[0] = wrapper::vmovl(wrapper::vgetlow(mask_u16)); + mask.val[1] = wrapper::vmovl(wrapper::vgethigh(mask_u16)); + uint32x4x2_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 }, + { idx + 4, idx + 5, idx + 6, idx + 7 } + } + }; + if(axis != 0) + { + vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + } + uint32x4x4_t res = { wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]), + wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]), + 0, 0 + }; + + return res; +} + +uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_value, ReductionOperation op) +{ + uint32x4x2_t res_idx_mask{ 0 }; + uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); + uint16x8_t 
mask_u16; + if(op == ReductionOperation::ARG_IDX_MIN) + { + auto pmin = wrapper::vpmin(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + pmin = wrapper::vpmin(pmin, pmin); + pmin = wrapper::vpmin(pmin, pmin); + mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); + } + else + { + auto pmax = wrapper::vpmax(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + pmax = wrapper::vpmax(pmax, pmax); + pmax = wrapper::vpmax(pmax, pmax); + mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax)); + } + + // Widen vectors + auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16))); + auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16))); + res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1); + res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2); + res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones); + res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones); + + uint32_t res = 0xFFFFFFFF; + int iter = 0; + do + { + auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter])); + pmin = wrapper::vpmin(pmin, pmin); + res = std::min(wrapper::vgetlane(pmin, 0), res); + iter++; + } + while(iter < 2); + + return (res - 0xFFFFFFFF); +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + template class Reducer { public: - static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f) + static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op) { // Set out window Window out_window(window); @@ -58,11 +273,11 @@ public: Iterator in(input, in_slice); Iterator out(output, out_slice); - f(in, out, in_slice, out_slice, *input->info()); + f(in, out, in_slice, out_slice, *input->info(), op); } while(window.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice)); } - static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f) + static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op) { // Set in window Window in_window(window); @@ -80,11 +295,11 @@ public: Iterator in(input, in_slice); Iterator out(output, out_slice); - f(in, out, in_slice, out_slice, *input->info(), 1); + f(in, out, in_slice, out_slice, *input->info(), 1, op); } while(in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice)); } - static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f) + static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op) { // Set in window Window in_window(window); @@ -102,11 +317,11 @@ public: Iterator in(input, in_slice); Iterator out(output, out_slice); - f(in, out, in_slice, out_slice, *input->info(), 2); + f(in, out, in_slice, out_slice, *input->info(), 2, op); } while(in_window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_3D(out_slice)); } - static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f) + static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op) { // Set in/out window Window in_window(window); @@ -124,115 +339,205 @@ public: Iterator in(input, in_slice); Iterator out(output, out_slice); - f(in, out, 
in_slice, out_slice, *input->info(), 3); + f(in, out, in_slice, out_slice, *input->info(), 3, op); } while(in_window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_4D(out_slice)); } }; -template +template struct RedOpX { /** NEON vector tag type. */ using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info) + inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, const ReductionOperation op) { ARM_COMPUTE_UNUSED(out_slice); - auto vec_sum_value = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + auto init_res_value = static_cast(0.f); + if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN) + { + init_res_value = *reinterpret_cast(input.ptr()); + } + auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{}); + uint32x4x4_t vec_res_idx{ 0 }; execute_window_loop(in_slice, [&](const Coordinates & id) { const auto in_ptr = reinterpret_cast(input.ptr()); const auto vec_elements = wrapper::vloadq(in_ptr); - if(op == ReductionOperation::SUM_SQUARE) - { - vec_sum_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_sum_value); - } - else + switch(op) { - vec_sum_value = wrapper::vadd(vec_elements, vec_sum_value); + case ReductionOperation::SUM_SQUARE: + vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); + break; + case ReductionOperation::MEAN_SUM: + case ReductionOperation::SUM: + vec_res_value = wrapper::vadd(vec_elements, vec_res_value); + break; + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } }, input); - auto carry_addition = wrapper::vpadd(wrapper::vgethigh(vec_sum_value), wrapper::vgetlow(vec_sum_value)); - for(int i = 0; i < S / 4; ++i) + switch(op) { - carry_addition = wrapper::vpadd(carry_addition, carry_addition); - } + case ReductionOperation::SUM: + case ReductionOperation::SUM_SQUARE: + case ReductionOperation::MEAN_SUM: + { + auto carry_res = wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + for(int i = 0; i < S / 4; ++i) + { + carry_res = wrapper::vpadd(carry_res, carry_res); + } + auto res = wrapper::vgetlane(carry_res, 0); - auto res = wrapper::vgetlane(carry_addition, 0); - if(op == ReductionOperation::MEAN_SUM) - { - res /= in_info.dimension(0); - } + if(op == ReductionOperation::MEAN_SUM) + { + res /= in_info.dimension(0); + } - *(reinterpret_cast(output.ptr())) = res; + *(reinterpret_cast(output.ptr())) = res; + break; + } + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::ARG_IDX_MAX: + { + auto res = calculate_vector_index(vec_res_idx, vec_res_value, op); + *(reinterpret_cast(output.ptr())) = res; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } } }; -template struct RedOpX_qasymm8 { - inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, 
const TensorInfo &in_info) + inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, const ReductionOperation op) { ARM_COMPUTE_UNUSED(out_slice); - auto vec_sum_value1 = vdupq_n_u32(static_cast(0.f)); - auto vec_sum_value2 = vdupq_n_u32(static_cast(0.f)); - auto vec_sum_value3 = vdupq_n_u32(static_cast(0.f)); - auto vec_sum_value4 = vdupq_n_u32(static_cast(0.f)); + auto vec_res_value1 = vdupq_n_u32(static_cast(0.f)); + auto vec_res_value2 = vdupq_n_u32(static_cast(0.f)); + auto vec_res_value3 = vdupq_n_u32(static_cast(0.f)); + auto vec_res_value4 = vdupq_n_u32(static_cast(0.f)); + + uint8x16_t vec_res_value; + if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN) + { + vec_res_value = wrapper::vdup_n(*input.ptr(), wrapper::traits::vector_128_tag{}); + } + uint32x4x4_t vec_res_idx{ 0 }; execute_window_loop(in_slice, [&](const Coordinates & id) { const auto vec_elements = wrapper::vloadq(input.ptr()); - - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - vec_sum_value1 = wrapper::vadd(temp32x4t_1, vec_sum_value1); - vec_sum_value2 = wrapper::vadd(temp32x4t_2, vec_sum_value2); - vec_sum_value3 = wrapper::vadd(temp32x4t_3, vec_sum_value3); - vec_sum_value4 = wrapper::vadd(temp32x4t_4, vec_sum_value4); + switch(op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + { + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); + + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); + vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); + vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); + vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); + break; + } + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } }, input); - auto carry_addition = wrapper::vadd(vec_sum_value1, vec_sum_value2); - carry_addition = wrapper::vadd(carry_addition, vec_sum_value3); - carry_addition = wrapper::vadd(carry_addition, vec_sum_value4); - - auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_addition), wrapper::vgetlow(carry_addition)); - carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition); - auto res = wrapper::vgetlane(carry_paddition, 0); - - 
if(op == ReductionOperation::MEAN_SUM) + if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) { - res /= in_info.dimension(0); + auto res = calculate_vector_index(vec_res_idx, vec_res_value, op); + *(reinterpret_cast(output.ptr())) = res; } + else + { + auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2); + carry_res = wrapper::vadd(carry_res, vec_res_value3); + carry_res = wrapper::vadd(carry_res, vec_res_value4); - *(output.ptr()) = static_cast(res); + auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res)); + carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition); + auto res = wrapper::vgetlane(carry_paddition, 0); + + if(op == ReductionOperation::MEAN_SUM) + { + res /= in_info.dimension(0); + } + + *(output.ptr()) = static_cast(res); + } } }; -template +template struct RedOpYZW { /** NEON vector tag type. */ using ExactTagType = typename wrapper::traits::neon_vector::tag_type; + using neon_vector = typename wrapper::traits::neon_vector::type; - inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis) + inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis, const ReductionOperation op) { ARM_COMPUTE_UNUSED(out_slice); execute_window_loop(in_slice, [&](const Coordinates & id) { - auto vec_sum_value = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + neon_vector vec_res_value; + if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN) + { + vec_res_value = wrapper::vloadq(reinterpret_cast(input.ptr())); + } + else + { + vec_res_value = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + } + uint32x4x4_t vec_res_idx{ 0 }; + for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) { T *in_ptr; @@ -252,41 +557,68 @@ struct RedOpYZW } const auto vec_elements = wrapper::vloadq(in_ptr); - if(op == ReductionOperation::SUM_SQUARE) + switch(op) { - vec_sum_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_sum_value); - } - else - { - vec_sum_value = wrapper::vadd(vec_elements, vec_sum_value); + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + vec_res_value = wrapper::vadd(vec_elements, vec_res_value); + break; + case ReductionOperation::SUM_SQUARE: + vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); + break; + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } } if(op == ReductionOperation::MEAN_SUM) { auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast(in_info.dimension(axis)), ExactTagType{})); - vec_sum_value = wrapper::vmul(vec_sum_value, vec_width_inv); + vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv); } - wrapper::vstore(reinterpret_cast(output.ptr()), vec_sum_value); + if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) + { + 
wrapper::vstore(reinterpret_cast(output.ptr()), vec_res_idx.val[0]); + } + else + { + wrapper::vstore(reinterpret_cast(output.ptr()), vec_res_value); + } }, input, output); } }; -template struct RedOpYZW_qasymm8 { - inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis) + inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis, const ReductionOperation op) { ARM_COMPUTE_UNUSED(out_slice); execute_window_loop(in_slice, [&](const Coordinates & id) { - auto vec_sum_value1 = vdupq_n_u32(static_cast(0.f)); - auto vec_sum_value2 = vdupq_n_u32(static_cast(0.f)); - auto vec_sum_value3 = vdupq_n_u32(static_cast(0.f)); - auto vec_sum_value4 = vdupq_n_u32(static_cast(0.f)); + uint32x4x4_t vec_res_idx{ 0 }; + auto vec_res_value1 = vdupq_n_u32(0); + auto vec_res_value2 = vdupq_n_u32(0); + auto vec_res_value3 = vdupq_n_u32(0); + auto vec_res_value4 = vdupq_n_u32(0); + auto vec_res_value = wrapper::vloadq(input.ptr()); + for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) { uint8_t *in_ptr; @@ -306,169 +638,78 @@ struct RedOpYZW_qasymm8 } const auto vec_elements = wrapper::vloadq(in_ptr); - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - vec_sum_value1 = wrapper::vadd(temp32x4t_1, vec_sum_value1); - vec_sum_value2 = wrapper::vadd(temp32x4t_2, vec_sum_value2); - vec_sum_value3 = wrapper::vadd(temp32x4t_3, vec_sum_value3); - vec_sum_value4 = wrapper::vadd(temp32x4t_4, vec_sum_value4); + switch(op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + { + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); + + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); + vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); + vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); + vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); + break; + } + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } } if(op == ReductionOperation::MEAN_SUM) { const auto vec_width_inv = wrapper::vinv(vdupq_n_f32(in_info.dimension(axis))); - const auto vec_sum_value1_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value1), 
vec_width_inv); - const auto vec_sum_value2_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value2), vec_width_inv); - const auto vec_sum_value3_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value3), vec_width_inv); - const auto vec_sum_value4_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value4), vec_width_inv); - - vec_sum_value1 = vcvtq_u32_f32(vec_sum_value1_f); - vec_sum_value2 = vcvtq_u32_f32(vec_sum_value2_f); - vec_sum_value3 = vcvtq_u32_f32(vec_sum_value3_f); - vec_sum_value4 = vcvtq_u32_f32(vec_sum_value4_f); - } - - const auto temp16x8t_1 = vcombine_u16(wrapper::vqmovn(vec_sum_value1), wrapper::vqmovn(vec_sum_value2)); - const auto temp16x8t_2 = vcombine_u16(wrapper::vqmovn(vec_sum_value3), wrapper::vqmovn(vec_sum_value4)); - auto res = vcombine_u8(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); - wrapper::vstore(output.ptr(), res); - }, - input, output); - } -}; - -void reduce_sumsq(const Window &window, const ITensor *input, ITensor *output, unsigned int axis) -{ - switch(axis) - { - case 0: - switch(input->info()->data_type()) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - return Reducer>::reduceX(window, input, output, RedOpX()); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - return Reducer>::reduceX(window, input, output, RedOpX()); - case DataType::QASYMM8: - default: - ARM_COMPUTE_ERROR("Not supported"); - } - case 1: - switch(input->info()->data_type()) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - return Reducer>::reduceY(window, input, output, RedOpYZW()); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - return Reducer>::reduceY(window, input, output, RedOpYZW()); - case DataType::QASYMM8: - default: - ARM_COMPUTE_ERROR("Not supported"); + const auto vec_res_value1_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value1), vec_width_inv); + const auto vec_res_value2_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value2), vec_width_inv); + const auto vec_res_value3_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value3), vec_width_inv); + const auto vec_res_value4_f = wrapper::vmul(vcvtq_f32_u32(vec_res_value4), vec_width_inv); + + vec_res_value1 = vcvtq_u32_f32(vec_res_value1_f); + vec_res_value2 = vcvtq_u32_f32(vec_res_value2_f); + vec_res_value3 = vcvtq_u32_f32(vec_res_value3_f); + vec_res_value4 = vcvtq_u32_f32(vec_res_value4_f); } - case 2: - switch(input->info()->data_type()) + if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - return Reducer>::reduceZ(window, input, output, RedOpYZW()); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - return Reducer>::reduceZ(window, input, output, RedOpYZW()); - case DataType::QASYMM8: - default: - ARM_COMPUTE_ERROR("Not supported"); + wrapper::vstore(reinterpret_cast(output.ptr()), vec_res_idx.val[0]); + wrapper::vstore(reinterpret_cast(output.ptr()) + 4, vec_res_idx.val[1]); + wrapper::vstore(reinterpret_cast(output.ptr()) + 8, vec_res_idx.val[2]); + wrapper::vstore(reinterpret_cast(output.ptr()) + 12, vec_res_idx.val[3]); } - case 3: - switch(input->info()->data_type()) + else { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - return Reducer>::reduceW(window, input, output, RedOpYZW()); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - return Reducer>::reduceW(window, input, output, RedOpYZW()); - case DataType::QASYMM8: - default: - ARM_COMPUTE_ERROR("Not supported"); + const auto temp16x8t_1 
= vcombine_u16(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2)); + const auto temp16x8t_2 = vcombine_u16(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4)); + auto res = vcombine_u8(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); + wrapper::vstore(output.ptr(), res); } - default: - ARM_COMPUTE_ERROR("Unsupported reduction axis"); - } -} -void reduce_sum(const Window &window, const ITensor *input, ITensor *output, unsigned int axis) -{ - switch(axis) - { - case 0: - switch(input->info()->data_type()) - { - case DataType::QASYMM8: - return Reducer>::reduceX(window, input, output, RedOpX_qasymm8()); -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - return Reducer>::reduceX(window, input, output, RedOpX()); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - return Reducer>::reduceX(window, input, output, RedOpX()); - default: - ARM_COMPUTE_ERROR("Not supported"); - } - case 1: - switch(input->info()->data_type()) - { - case DataType::QASYMM8: - return Reducer>::reduceY(window, input, output, RedOpYZW_qasymm8()); -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - return Reducer>::reduceY(window, input, output, RedOpYZW()); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - return Reducer>::reduceY(window, input, output, RedOpYZW()); - default: - ARM_COMPUTE_ERROR("Not supported"); - } - case 2: - switch(input->info()->data_type()) - { - case DataType::QASYMM8: - return Reducer>::reduceZ(window, input, output, RedOpYZW_qasymm8()); -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - return Reducer>::reduceZ(window, input, output, RedOpYZW()); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - return Reducer>::reduceZ(window, input, output, RedOpYZW()); - default: - ARM_COMPUTE_ERROR("Not supported"); - } - case 3: - switch(input->info()->data_type()) - { - case DataType::QASYMM8: - return Reducer>::reduceW(window, input, output, RedOpYZW_qasymm8()); -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - return Reducer>::reduceW(window, input, output, RedOpYZW()); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - return Reducer>::reduceW(window, input, output, RedOpYZW()); - default: - ARM_COMPUTE_ERROR("Not supported"); - } - default: - ARM_COMPUTE_ERROR("Unsupported reduction axis"); + }, + input, output); } -} -void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output, unsigned int axis) +}; + +void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op) { switch(axis) { @@ -476,13 +717,13 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output switch(input->info()->data_type()) { case DataType::QASYMM8: - return Reducer>::reduceX(window, input, output, RedOpX_qasymm8()); + return Reducer::reduceX(window, input, output, RedOpX_qasymm8(), op); #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - return Reducer>::reduceX(window, input, output, RedOpX()); + return Reducer>::reduceX(window, input, output, RedOpX(), op); #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: - return Reducer>::reduceX(window, input, output, RedOpX()); + return Reducer>::reduceX(window, input, output, RedOpX(), op); default: ARM_COMPUTE_ERROR("Not supported"); } @@ -490,13 +731,13 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output 
switch(input->info()->data_type()) { case DataType::QASYMM8: - return Reducer>::reduceY(window, input, output, RedOpYZW_qasymm8()); + return Reducer::reduceY(window, input, output, RedOpYZW_qasymm8(), op); #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - return Reducer>::reduceY(window, input, output, RedOpYZW()); + return Reducer>::reduceY(window, input, output, RedOpYZW(), op); #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: - return Reducer>::reduceY(window, input, output, RedOpYZW()); + return Reducer>::reduceY(window, input, output, RedOpYZW(), op); default: ARM_COMPUTE_ERROR("Not supported"); } @@ -504,13 +745,13 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output switch(input->info()->data_type()) { case DataType::QASYMM8: - return Reducer>::reduceZ(window, input, output, RedOpYZW_qasymm8()); + return Reducer::reduceZ(window, input, output, RedOpYZW_qasymm8(), op); #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - return Reducer>::reduceZ(window, input, output, RedOpYZW()); + return Reducer>::reduceZ(window, input, output, RedOpYZW(), op); #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: - return Reducer>::reduceZ(window, input, output, RedOpYZW()); + return Reducer>::reduceZ(window, input, output, RedOpYZW(), op); default: ARM_COMPUTE_ERROR("Not supported"); } @@ -518,13 +759,13 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output switch(input->info()->data_type()) { case DataType::QASYMM8: - return Reducer>::reduceW(window, input, output, RedOpYZW_qasymm8()); + return Reducer::reduceW(window, input, output, RedOpYZW_qasymm8(), op); #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - return Reducer>::reduceW(window, input, output, RedOpYZW()); + return Reducer>::reduceW(window, input, output, RedOpYZW(), op); #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: - return Reducer>::reduceW(window, input, output, RedOpYZW()); + return Reducer>::reduceW(window, input, output, RedOpYZW(), op); default: ARM_COMPUTE_ERROR("Not supported"); } @@ -533,14 +774,6 @@ void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output } } -TensorShape calculate_output_shape(const TensorShape &input_shape, unsigned int axis) -{ - TensorShape output_shape{ input_shape }; - output_shape.set(axis, 1); - - return output_shape; -} - Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op) { ARM_COMPUTE_UNUSED(op); @@ -553,10 +786,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u if(output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN); + if(!is_arg_min_max) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32); + } + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); - const TensorShape output_shape = calculate_output_shape(input->tensor_shape(), axis); + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis); const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped); } @@ -564,13 
+806,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u return Status{}; } -std::tuple validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis) +std::tuple validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis, ReductionOperation op) { // Calculate output shape and set if empty - const TensorShape output_shape = calculate_output_shape(input->tensor_shape(), axis); + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis); // Output auto initialization if not yet initialized - auto_init_if_empty(*output, output_shape, 1, input->data_type()); + const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX); + DataType output_data_type = is_arg_min_max ? DataType::U32 : input->data_type(); + auto_init_if_empty(*output, output_shape, 1, output_data_type); unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type()); @@ -613,7 +857,7 @@ void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output _reduction_axis = axis; // Configure kernel window - auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis); + auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis, op); ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); @@ -623,7 +867,7 @@ void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output Status NEReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); - ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis))); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis, op))); return Status{}; } @@ -634,19 +878,6 @@ void NEReductionOperationKernel::run(const Window &window, const ThreadInfo &inf ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - switch(_op) - { - case ReductionOperation::SUM_SQUARE: - reduce_sumsq(window, _input, _output, _reduction_axis); - break; - case ReductionOperation::MEAN_SUM: - reduce_mean_sum(window, _input, _output, _reduction_axis); - break; - case ReductionOperation::SUM: - reduce_sum(window, _input, _output, _reduction_axis); - break; - default: - ARM_COMPUTE_ERROR("Unsupported reduction operation."); - } + reduce_op(window, _input, _output, _reduction_axis, _op); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp index 0c134c00ed..f2697bcc6d 100644 --- a/src/core/NEON/kernels/NESelectKernel.cpp +++ b/src/core/NEON/kernels/NESelectKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -14,9 +14,9 @@ * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INNEUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY NEAIM, DAMAGES OR OTHER + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. @@ -67,7 +67,7 @@ void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITen const auto c = (*condition_conversion)(condition_ptr + x); const auto a = wrapper::vloadq(input1_ptr + x); const auto b = wrapper::vloadq(input2_ptr + x); - wrapper::vstore(output_ptr + x, wrapper::vbitselect(c, a, b)); + wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b)); } for(; x < window_end_x; ++x) { @@ -90,7 +90,7 @@ void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, IT select_op(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) { static const auto zero = wrapper::vdup_n(static_cast(0), arm_compute::wrapper::traits::vector_128_tag()); - return wrapper::vgreaterthan(wrapper::vloadq(condition_ptr), zero); + return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero); }); } @@ -104,7 +104,7 @@ void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, I select_op(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) { static const auto zero = wrapper::vdup_n(static_cast(0), arm_compute::wrapper::traits::vector_128_tag()); - return wrapper::vgreaterthan(wrapper::vmovl(wrapper::vload(condition_ptr)), zero); + return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero); }); } @@ -118,7 +118,7 @@ void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, I select_op(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) { static const auto zero = wrapper::vdup_n(static_cast(0), arm_compute::wrapper::traits::vector_128_tag()); - return wrapper::vgreaterthan(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero); + return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero); }); } diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp new file mode 100644 index 0000000000..d33e1342b9 --- /dev/null +++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +namespace arm_compute +{ +NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr memory_manager) + : _memory_group(std::move(memory_manager)), _reduction_kernel(), _fill_border_kernel(), _run_fill_border(false) +{ +} +void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, const ReductionOperation &op) +{ + _reduction_kernel.configure(input, output, axis, op); + + if(axis == 0) + { + _fill_border_kernel.configure(input, _reduction_kernel.border_size(), BorderMode::REPLICATE); + _run_fill_border = true; + } +} + +Status NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) +{ + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation"); + ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output, axis, op)); + return Status{}; +} + +void NEArgMinMaxLayer::run() +{ + _memory_group.acquire(); + + if(_run_fill_border) + { + NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY); + } + NEScheduler::get().schedule(&_reduction_kernel, Window::DimY); + + _memory_group.release(); +} + +} // namespace arm_compute \ No newline at end of file diff --git a/tests/validation/NEON/ArgMinMax.cpp b/tests/validation/NEON/ArgMinMax.cpp new file mode 100644 index 0000000000..611495a41d --- /dev/null +++ b/tests/validation/NEON/ArgMinMax.cpp @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h" +#include "arm_compute/runtime/Tensor.h" +#include "arm_compute/runtime/TensorAllocator.h" + +#include "tests/NEON/Accessor.h" +#include "tests/datasets/ShapeDatasets.h" +#include "tests/datasets/SplitDataset.h" +#include "tests/framework/Asserts.h" +#include "tests/framework/Macros.h" +#include "tests/validation/Validation.h" +#include "tests/validation/fixtures/ArgMinMaxFixture.h" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +TEST_SUITE(NEON) +TEST_SUITE(ArgMinMax) + +// *INDENT-OFF* +// clang-format off +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( + framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 3U, 16U, 2U), 1, DataType::F32), // Invalid axis + TensorInfo(TensorShape(27U, 3U, 16U, 2U), 1, DataType::F32), // Invalid output shape + TensorInfo(TensorShape(32U, 16U, 16U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 16U, 16U, 2U), 1, DataType::F32) // Invalid operation + }), + framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(27U, 3U, 1U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(27U, 3U, 1U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 16U, 1U, 2U), 1, DataType::U32), + TensorInfo(TensorShape(32U, 16U, 1U, 2U), 1, DataType::F32) + })), + framework::dataset::make("Axis", { 4, 0, 2, 0 })), + framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MAX, ReductionOperation::ARG_IDX_MAX, ReductionOperation::ARG_IDX_MAX, ReductionOperation::MEAN_SUM })), + framework::dataset::make("Expected", { false, false, true, false })), + input_info, output_info, axis, operation, expected) +{ + const Status status = NEArgMinMaxLayer::validate(&input_info.clone()->set_is_resizable(false), axis, &output_info.clone()->set_is_resizable(false), operation); + ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); +} +// clang-format on +// *INDENT-ON* + +DATA_TEST_CASE(Configuration, + framework::DatasetMode::ALL, + combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::F16, DataType::F32 })), + shape, data_type) +{ + // Create tensors + Tensor ref_src = create_tensor(shape, data_type); + Tensor dst; + + // Create and Configure function + NEArgMinMaxLayer arg_min_max_layer; + arg_min_max_layer.configure(&ref_src, 1, &dst, ReductionOperation::ARG_IDX_MAX); + + // Validate valid region + TensorShape output_shape = shape; + output_shape.set(1, 1); + const ValidRegion valid_region = shape_to_valid_region(output_shape); + validate(dst.info()->valid_region(), valid_region); +} + +template +using NEArgMinMaxValidationFixture = ArgMinMaxValidationFixture; + +TEST_SUITE(Float) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +TEST_SUITE(FP16) +FIXTURE_DATA_TEST_CASE(RunSmall, + NEArgMinMaxValidationFixture, + framework::DatasetMode::PRECOMMIT, + combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX }))) +{ + // Validate output + validate(Accessor(_target), _reference); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, + NEArgMinMaxValidationFixture, + framework::DatasetMode::NIGHTLY, + combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), 
framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX }))) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // FP16 +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +template +using NEArgMinMaxQuantizedValidationFixture = ArgMinMaxValidationQuantizedFixture; + +TEST_SUITE(QASYMM8) +FIXTURE_DATA_TEST_CASE(RunSmall, + NEArgMinMaxQuantizedValidationFixture, + framework::DatasetMode::PRECOMMIT, + combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), + framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) }))) +{ + // Validate output + validate(Accessor(_target), _reference); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, + NEArgMinMaxQuantizedValidationFixture, + framework::DatasetMode::NIGHTLY, + combine(combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), + framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) }))) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // QASYMM8 + +TEST_SUITE(FP32) +FIXTURE_DATA_TEST_CASE(RunSmall, + NEArgMinMaxValidationFixture, + framework::DatasetMode::PRECOMMIT, + combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX }))) +{ + // Validate output + validate(Accessor(_target), _reference); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, + NEArgMinMaxValidationFixture, + framework::DatasetMode::NIGHTLY, + combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX }))) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // FP32 +TEST_SUITE_END() // Float +TEST_SUITE_END() // ArgMinMax +TEST_SUITE_END() // NEON +} // namespace validation +} // namespace test +} // namespace arm_compute diff --git a/tests/validation/fixtures/ArgMinMaxFixture.h b/tests/validation/fixtures/ArgMinMaxFixture.h index 5f5f85c104..e263b25bf2 100644 --- a/tests/validation/fixtures/ArgMinMaxFixture.h +++ b/tests/validation/fixtures/ArgMinMaxFixture.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -42,28 +42,38 @@ namespace test namespace validation { template -class ArgMinMaxValidationFixture : public framework::Fixture +class ArgMinMaxValidationBaseFixture : public framework::Fixture { public: template - void setup(TensorShape shape, DataType data_type, int axis, ReductionOperation op) + void setup(TensorShape shape, DataType data_type, int axis, ReductionOperation op, QuantizationInfo q_info) { - _target = compute_target(shape, data_type, axis, op); - _reference = compute_reference(shape, data_type, axis, op); + _target = compute_target(shape, data_type, axis, op, q_info); + _reference = compute_reference(shape, data_type, axis, op, q_info); } protected: template void fill(U &&tensor) { - std::uniform_real_distribution<> distribution(-1.0f, 1.0f); - library->fill(tensor, distribution, 0); + if(!is_data_type_quantized(tensor.data_type())) + { + std::uniform_real_distribution<> distribution(-1.0f, 1.0f); + library->fill(tensor, distribution, 0); + } + else + { + std::pair bounds = get_quantized_bounds(tensor.quantization_info(), -1.0f, 1.0f); + std::uniform_int_distribution distribution(bounds.first, bounds.second); + + library->fill(tensor, distribution, 0); + } } - TensorType compute_target(TensorShape &src_shape, DataType data_type, int axis, ReductionOperation op) + TensorType compute_target(TensorShape &src_shape, DataType data_type, int axis, ReductionOperation op, QuantizationInfo q_info) { // Create tensors - TensorType src = create_tensor(src_shape, data_type, 1); + TensorType src = create_tensor(src_shape, data_type, 1, q_info); TensorType dst; // Create and configure function @@ -89,21 +99,43 @@ protected: return dst; } - SimpleTensor compute_reference(TensorShape &src_shape, DataType data_type, int axis, ReductionOperation op) + SimpleTensor compute_reference(TensorShape &src_shape, DataType data_type, int axis, ReductionOperation op, QuantizationInfo q_info) { // Create reference - SimpleTensor src{ src_shape, data_type, 1 }; + SimpleTensor src{ src_shape, data_type, 1, q_info }; // Fill reference fill(src); TensorShape output_shape = src_shape; output_shape.set(axis, 1); - return reference::reduction_operation(src, output_shape, axis, op); + return reference::reduction_operation(src, output_shape, axis, op); } - TensorType _target{}; - SimpleTensor _reference{}; + TensorType _target{}; + SimpleTensor _reference{}; +}; + +template +class ArgMinMaxValidationQuantizedFixture : public ArgMinMaxValidationBaseFixture +{ +public: + template + void setup(const TensorShape &shape, DataType data_type, int axis, ReductionOperation op, QuantizationInfo quantization_info) + { + ArgMinMaxValidationBaseFixture::setup(shape, data_type, axis, op, quantization_info); + } +}; + +template +class ArgMinMaxValidationFixture : public ArgMinMaxValidationBaseFixture +{ +public: + template + void setup(const TensorShape &shape, DataType data_type, int axis, ReductionOperation op) + { + ArgMinMaxValidationBaseFixture::setup(shape, data_type, axis, op, QuantizationInfo()); + } }; } // namespace validation } // namespace test diff --git a/tests/validation/fixtures/ReduceMeanFixture.h b/tests/validation/fixtures/ReduceMeanFixture.h index 769d7f674f..44bb9fca6a 100644 --- a/tests/validation/fixtures/ReduceMeanFixture.h +++ b/tests/validation/fixtures/ReduceMeanFixture.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -113,7 +113,7 @@ protected: { TensorShape output_shape = i == 0 ? src_shape : out.shape(); output_shape.set(axis[i], 1); - out = reference::reduction_operation(i == 0 ? src : out, output_shape, axis[i], ReductionOperation::MEAN_SUM); + out = reference::reduction_operation(i == 0 ? src : out, output_shape, axis[i], ReductionOperation::MEAN_SUM); } if(!keep_dims) diff --git a/tests/validation/fixtures/ReductionOperationFixture.h b/tests/validation/fixtures/ReductionOperationFixture.h index 9079b47cbb..d01f41abf0 100644 --- a/tests/validation/fixtures/ReductionOperationFixture.h +++ b/tests/validation/fixtures/ReductionOperationFixture.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -107,7 +107,7 @@ protected: // Fill reference fill(src); - return reference::reduction_operation(src, dst_shape, axis, op); + return reference::reduction_operation(src, dst_shape, axis, op); } TensorType _target{}; diff --git a/tests/validation/reference/L2NormalizeLayer.cpp b/tests/validation/reference/L2NormalizeLayer.cpp index fcd6226f07..43885b29e2 100644 --- a/tests/validation/reference/L2NormalizeLayer.cpp +++ b/tests/validation/reference/L2NormalizeLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -54,7 +54,7 @@ SimpleTensor l2_normalize(const SimpleTensor &src, unsigned int axis, floa SimpleTensor dst{ src.shape(), src.data_type() }; // Reduce across given axis - SimpleTensor sum = reduction_operation(src, get_output_shape(src.shape(), axis), axis, ReductionOperation::SUM_SQUARE); + SimpleTensor sum = reduction_operation(src, get_output_shape(src.shape(), axis), axis, ReductionOperation::SUM_SQUARE); // Compute reference const int upper_dims = src.shape().total_size_upper(axis + 1); diff --git a/tests/validation/reference/ReductionOperation.cpp b/tests/validation/reference/ReductionOperation.cpp index 37a9be86c0..fc12e31d75 100644 --- a/tests/validation/reference/ReductionOperation.cpp +++ b/tests/validation/reference/ReductionOperation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -49,20 +49,20 @@ OT reduce_operation(const T *ptr, int reduce_elements, ReductionOperation op, in uint32_t int_res = 0; for(int i = 0; i < reduce_elements; ++i) { - auto elem = static_cast(*(ptr + stride * i)); + auto elem = *(ptr + stride * i); switch(op) { case ReductionOperation::ARG_IDX_MIN: - if(static_cast(*(ptr + stride * static_cast(res))) > elem) + if(*(ptr + stride * static_cast(int_res)) > elem) { - res = static_cast(i); + int_res = static_cast(i); } break; case ReductionOperation::ARG_IDX_MAX: - if(static_cast(*(ptr + stride * static_cast(res))) < elem) + if(*(ptr + stride * static_cast(int_res)) < elem) { - res = static_cast(i); + int_res = static_cast(i); } break; case ReductionOperation::SUM_SQUARE: @@ -122,13 +122,13 @@ OT reduce_operation(const T *ptr, int reduce_elements, ReductionOperation op, in } } // namespace -template -SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op) +template +SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op) { // Create reference const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX); DataType output_data_type = is_arg_min_max ? DataType::U32 : src.data_type(); - SimpleTensor dst{ dst_shape, output_data_type, 1, src.quantization_info() }; + SimpleTensor dst{ dst_shape, output_data_type, 1, src.quantization_info() }; const unsigned int src_width = src.shape().x(); const unsigned int src_height = src.shape().y(); const unsigned int src_depth = src.shape().z(); @@ -143,14 +143,7 @@ SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShap for(unsigned int du = 0; du < upper_dims; ++du) { const T *src_row_ptr = src.data() + du * reduce_elems; - if(is_arg_min_max) - { - dst[du] = reduce_operation(src_row_ptr, reduce_elems, op, 1); - } - else - { - dst[du] = reduce_operation(src_row_ptr, reduce_elems, op, 1); - } + dst[du] = reduce_operation(src_row_ptr, reduce_elems, op, 1); } } break; @@ -164,15 +157,7 @@ SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShap const int in_offset = du * src_height * src_width + x; const int out_offset = du * src_width + x; const T *src_row_ptr = src.data() + in_offset; - - if(is_arg_min_max) - { - dst[out_offset] = reduce_operation(src_row_ptr, reduce_elems, op, src_width); - } - else - { - dst[out_offset] = reduce_operation(src_row_ptr, reduce_elems, op, src_width); - } + dst[out_offset] = reduce_operation(src_row_ptr, reduce_elems, op, src_width); } } } @@ -189,15 +174,7 @@ SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShap const int in_offset = du * src_depth * src_height * src_width + y * src_width + x; const int out_offset = du * src_width * src_height + y * src_width + x; const T *src_row_ptr = src.data() + in_offset; - - if(is_arg_min_max) - { - dst[out_offset] = reduce_operation(src_row_ptr, reduce_elems, op, src_height * src_width); - } - else - { - dst[out_offset] = reduce_operation(src_row_ptr, reduce_elems, op, src_height * src_width); - } + dst[out_offset] = reduce_operation(src_row_ptr, reduce_elems, op, src_height * src_width); } } } @@ -217,14 +194,7 @@ SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShap const int in_offset = du * src_batch * src_depth * src_height * src_width + z * src_width * src_height + y * src_width + x; const int out_offset = du * src_depth * 
src_height * src_width + z * src_width * src_height + y * src_width + x; const T *src_row_ptr = src.data() + in_offset; - if(is_arg_min_max) - { - dst[out_offset] = reduce_operation(src_row_ptr, reduce_elems, op, src_width * src_height * src_depth); - } - else - { - dst[out_offset] = reduce_operation(src_row_ptr, reduce_elems, op, src_width * src_height * src_depth); - } + dst[out_offset] = reduce_operation(src_row_ptr, reduce_elems, op, src_width * src_height * src_depth); } } } @@ -238,6 +208,9 @@ SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShap return dst; } +template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op); +template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op); +template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op); template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op); template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op); template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op); diff --git a/tests/validation/reference/ReductionOperation.h b/tests/validation/reference/ReductionOperation.h index 859b57aa7b..9f7050f551 100644 --- a/tests/validation/reference/ReductionOperation.h +++ b/tests/validation/reference/ReductionOperation.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -35,10 +35,10 @@ namespace validation { namespace reference { -template -SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op); +template +SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op); } // namespace reference } // namespace validation } // namespace test } // namespace arm_compute -#endif /* __ARM_COMPUTE_TEST_FLOOR_H__ */ +#endif /* __ARM_COMPUTE_TEST_REDUCTION_OPERATION_H__ */ -- cgit v1.2.1
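
Usage note (not part of the patch): a minimal sketch of how the new NEArgMinMaxLayer runtime function is driven, mirroring the configure()/run() sequence exercised by the Configuration test in tests/validation/NEON/ArgMinMax.cpp above. The tensor shape, the axis value and the fill step are illustrative assumptions rather than values taken from the change itself.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // Source: a 4D F32 tensor; the index of the maximum is taken along axis 1.
    Tensor src{};
    Tensor dst{};
    src.allocator()->init(TensorInfo(TensorShape(27U, 3U, 16U, 2U), 1, DataType::F32));

    // Configure the function; the output is auto-initialised to U32 with the
    // reduced shape, as done by validate_and_configure_window() in the kernel.
    NEArgMinMaxLayer argminmax{};
    argminmax.configure(&src, 1, &dst, ReductionOperation::ARG_IDX_MAX);

    // Allocate backing memory, fill the input, then run.
    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src here (e.g. via a Window/Iterator or by importing data) ...
    argminmax.run();

    return 0;
}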