diff options
53 files changed, 3581 insertions, 2430 deletions
diff --git a/Android.bp b/Android.bp index ab554a8ca2..1f1e591bd1 100644 --- a/Android.bp +++ b/Android.bp @@ -489,6 +489,8 @@ cc_library_static { "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp", "src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp", "src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp", + "src/cpu/kernels/dequantize/generic/neon/fp16.cpp", + "src/cpu/kernels/dequantize/generic/neon/fp32.cpp", "src/cpu/kernels/directconv2d/nchw/all.cpp", "src/cpu/kernels/directconv2d/nchw/fp16.cpp", "src/cpu/kernels/directconv2d/nhwc/neon/fp16.cpp", @@ -554,9 +556,17 @@ cc_library_static { "src/cpu/kernels/pool3d/neon/fp32.cpp", "src/cpu/kernels/pool3d/neon/qasymm8.cpp", "src/cpu/kernels/pool3d/neon/qasymm8_signed.cpp", + "src/cpu/kernels/quantize/generic/neon/fp16.cpp", + "src/cpu/kernels/quantize/generic/neon/fp32.cpp", + "src/cpu/kernels/quantize/generic/neon/integer.cpp", "src/cpu/kernels/range/generic/neon/fp16.cpp", "src/cpu/kernels/range/generic/neon/fp32.cpp", "src/cpu/kernels/range/generic/neon/integer.cpp", + "src/cpu/kernels/reduction_layer/generic/neon/fp16.cpp", + "src/cpu/kernels/reduction_layer/generic/neon/fp32.cpp", + "src/cpu/kernels/reduction_layer/generic/neon/integer.cpp", + "src/cpu/kernels/reduction_layer/generic/neon/qasymm8.cpp", + "src/cpu/kernels/reduction_layer/generic/neon/qasymm8_signed.cpp", "src/cpu/kernels/roialign/generic/neon/fp16.cpp", "src/cpu/kernels/roialign/generic/neon/fp32.cpp", "src/cpu/kernels/roialign/generic/neon/qasymm8.cpp", diff --git a/CMakeLists.txt b/CMakeLists.txt index c67479ce41..f291534201 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR) list(APPEND CMAKE_MESSAGE_CONTEXT ArmCompute) project( ArmCompute - VERSION 36.0.0 + VERSION 38.0.0 DESCRIPTION "The Arm Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A CPU and Arm® Mali™ GPU architectures" LANGUAGES C CXX ASM) @@ -9,7 +9,7 @@ <img src="https://raw.githubusercontent.com/ARM-software/ComputeLibrary/gh-pages/ACL_logo.png"/><br><br> </div> -# Compute Library ![](https://img.shields.io/badge/latest_release-24.04-green) +# Compute Library ![](https://img.shields.io/badge/latest_release-24.06-green) The Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A, Arm® Neoverse® and Arm® Mali™ GPUs architectures.<br> @@ -37,7 +37,7 @@ Key Features: <br> ## Documentation -[![Documentation](https://img.shields.io/badge/documentation-24.04-green)](https://arm-software.github.io/ComputeLibrary/latest) +[![Documentation](https://img.shields.io/badge/documentation-24.06-green)](https://arm-software.github.io/ComputeLibrary/latest) > Note: The documentation includes the reference API, changelogs, build guide, contribution guide, errata, etc. @@ -50,24 +50,24 @@ All the binaries can be downloaded from [here](https://github.com/ARM-software/C | Platform | Operating System | Release archive (Download) | | -------------- | ---------------- | -------------------------- | -| Raspberry Pi 4 | Linux® 32bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-armv7a-neon.tar.gz) | -| Raspberry Pi 4 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-neon.tar.gz) | -| Odroid N2 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-neon-cl.tar.gz) | -| HiKey960 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-neon-cl.tar.gz) | +| Raspberry Pi 4 | Linux® 32bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-armv7a-neon.tar.gz) | +| Raspberry Pi 4 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8a-neon.tar.gz) | +| Odroid N2 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8a-neon-cl.tar.gz) | +| HiKey960 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8a-neon-cl.tar.gz) | <br> | Architecture | Operating System | Release archive (Download) | | ------------ | ---------------- | -------------------------- | -| armv7 | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-armv7a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-armv7a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-armv7a-neon-cl.tar.gz) | -| arm64-v8a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-android-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-android-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-android-arm64-v8a-neon-cl.tar.gz) | -| arm64-v8a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8a-neon-cl.tar.gz) | -| arm64-v8.2-a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-android-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-android-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-android-arm64-v8.2-a-neon-cl.tar.gz) | -| arm64-v8.2-a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.04/arm_compute-v24.04-bin-linux-arm64-v8.2-a-neon-cl.tar.gz) | +| armv7 | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-armv7a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-armv7a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-armv7a-neon-cl.tar.gz) | +| arm64-v8a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-android-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-android-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-android-arm64-v8a-neon-cl.tar.gz) | +| arm64-v8a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8a-neon-cl.tar.gz) | +| arm64-v8.2-a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-android-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-android-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-android-arm64-v8.2-a-neon-cl.tar.gz) | +| arm64-v8.2-a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.06/arm_compute-v24.06-bin-linux-arm64-v8.2-a-neon-cl.tar.gz) | <br> -Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.04-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.04) +Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.06-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.06) Pre-build binaries are generated with the following security / good coding practices related flags: > -Wall, -Wextra, -Wformat=2, -Winit-self, -Wstrict-overflow=2, -Wswitch-default, -Woverloaded-virtual, -Wformat-security, -Wctor-dtor-privacy, -Wsign-promo, -Weffc++, -pedantic, -fstack-protector-strong diff --git a/SConscript b/SConscript index 80aa87cae8..325506ed40 100644 --- a/SConscript +++ b/SConscript @@ -32,8 +32,8 @@ import json import codecs import platform -VERSION = "v0.0-unreleased" -LIBRARY_VERSION_MAJOR = 36 +VERSION = "v24.06" +LIBRARY_VERSION_MAJOR = 38 LIBRARY_VERSION_MINOR = 0 LIBRARY_VERSION_PATCH = 0 SONAME_VERSION = str(LIBRARY_VERSION_MAJOR) + "." + str(LIBRARY_VERSION_MINOR) + "." + str(LIBRARY_VERSION_PATCH) diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h index 139d630fd7..c97751bc0c 100644 --- a/arm_compute/core/CPP/CPPTypes.h +++ b/arm_compute/core/CPP/CPPTypes.h @@ -170,7 +170,12 @@ public: * @return Number of CPUs */ unsigned int get_cpu_num() const; - + /** Return the maximum number of CPUs present excluding the little cores + * in case of an Android device + * + * @return Number of CPUs excluding little + */ + unsigned int get_cpu_num_excluding_little() const; /** Return the vector length in bytes for sme2 * * @return Vector length if sme2 is enabled, otherwise returns 0. diff --git a/arm_compute/function_info/ActivationLayerInfo.h b/arm_compute/function_info/ActivationLayerInfo.h index 9390d0c54f..83b12d572e 100644 --- a/arm_compute/function_info/ActivationLayerInfo.h +++ b/arm_compute/function_info/ActivationLayerInfo.h @@ -121,6 +121,20 @@ public: _lut_fp16 = lut; } #endif // __aarch64__ + + // The < and == are added to be able to use this data type as an attribute for LUTInfo + friend bool operator<(const ActivationLayerInfo &l, const ActivationLayerInfo &r) + { + const auto l_tup = std::make_tuple(l._act, l._a, l._b, l._enabled); + const auto r_tup = std::make_tuple(r._act, r._a, r._b, r._enabled); + + return l_tup < r_tup; + } + bool operator==(const ActivationLayerInfo &l) const + { + return this->_act == l._act && this->_a == l._a && this->_b == l._b && this->_enabled == l._enabled; + } + private: ActivationFunction _act = {ActivationLayerInfo::ActivationFunction::IDENTITY}; float _a = {}; diff --git a/arm_compute/runtime/OMP/OMPScheduler.h b/arm_compute/runtime/OMP/OMPScheduler.h index b522b403a9..9b39714fea 100644 --- a/arm_compute/runtime/OMP/OMPScheduler.h +++ b/arm_compute/runtime/OMP/OMPScheduler.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_OMPSCHEDULER_H -#define ARM_COMPUTE_OMPSCHEDULER_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_OMP_OMPSCHEDULER_H +#define ACL_ARM_COMPUTE_RUNTIME_OMP_OMPSCHEDULER_H #include "arm_compute/runtime/IScheduler.h" @@ -79,6 +79,7 @@ protected: private: unsigned int _num_threads; + unsigned int _nonlittle_num_cpus; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_OMPSCHEDULER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_OMP_OMPSCHEDULER_H diff --git a/docs/user_guide/errata.dox b/docs/user_guide/errata.dox index 056e45a432..c195dc7851 100644 --- a/docs/user_guide/errata.dox +++ b/docs/user_guide/errata.dox @@ -1,5 +1,5 @@ /// -/// Copyright (c) 2019-2023 Arm Limited. +/// Copyright (c) 2019-2024 Arm Limited. /// /// SPDX-License-Identifier: MIT /// @@ -30,6 +30,14 @@ namespace arm_compute @section S7_1_errata Errata +- (COMPMID-6904) Fix out-of-bound memory write for non-optimized FP16 GeMM kernel. + - Versions: >= v17.09 && < v24.06 + - Oses: Linux, Android, MacOS, Windows. + - Conditions: + - Compile the latest Arm Compute Library for armv8.2-a or multi_isa + - Device with FP16 support + - GeMM with beta coefficient != 0 or 1 + - (COMPMID-6493) Crash when running Arm Compute Library compiled for SVE2 on a computer that support SVE only. - Versions: >= v21.02 && <=v23.08 - OSs: Linux, Android. diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox index ce183d209c..16664c8d84 100644 --- a/docs/user_guide/release_version_and_change_log.dox +++ b/docs/user_guide/release_version_and_change_log.dox @@ -41,8 +41,16 @@ If there is more than one release in a month then an extra sequential number is @section S2_2_changelog Changelog +v24.06 Public minor release + - Enable FP16 in multiple Neon™ kernels for multi_isa + v8a + - Fix OpenMP® thread scheduling for large machine + - Optimize CPU activation functions using LUT-based implementation: + - Tanh function for FP16. + v24.05 Public major release - Add @ref CLScatter operator for FP32/16, S32/16/8, U32/16/8 data types + - Various fixes to enable FP16 kernels in armv8a multi_isa builds. + - Updated logic in the OpenMP scheduler to exclude LITTLE cores. v24.04 Public major release - Add Bfloat16 data type support for @ref NEMatMul. diff --git a/filelist.json b/filelist.json index eed9ccd54b..e833de9fc7 100644 --- a/filelist.json +++ b/filelist.json @@ -1415,7 +1415,11 @@ "src/cpu/operators/CpuDequantize.cpp", "src/cpu/kernels/CpuDequantizeKernel.cpp", "src/runtime/NEON/functions/NEDequantizationLayer.cpp" - ] + ], + "neon":{ + "fp32":["src/cpu/kernels/dequantize/generic/neon/fp32.cpp"], + "fp16":["src/cpu/kernels/dequantize/generic/neon/fp16.cpp"] + } } }, "DetectionPostProcess": { @@ -1593,7 +1597,6 @@ "neon": { "common": [ "src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp", - "src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp", "src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp", "src/core/NEON/kernels/arm_gemm/gemm_bf16bf16.cpp", "src/core/NEON/kernels/arm_gemm/gemm_int16.cpp", @@ -1605,7 +1608,6 @@ "src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp", "src/core/NEON/kernels/arm_gemm/interleave-8way.cpp", "src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp", - "src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp", "src/core/NEON/kernels/arm_gemm/mergeresults.cpp", "src/core/NEON/kernels/arm_gemm/misc.cpp", "src/core/NEON/kernels/arm_gemm/quantized.cpp", @@ -1622,13 +1624,8 @@ "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp", @@ -1682,6 +1679,15 @@ "fp32":["src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp", "src/cpu/kernels/gemm_matrix_add/generic/neon/fp32.cpp"], "fp16":["src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp", + "src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp", + "src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp", "src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp"], "estate32": [ "src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp", @@ -1690,16 +1696,15 @@ ], "estate64": [ "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp" + ], "fixed_format_kernels": [ "src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_6x16/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp" ] }, @@ -2093,7 +2098,12 @@ "src/cpu/operators/CpuQuantize.cpp", "src/cpu/kernels/CpuQuantizeKernel.cpp", "src/runtime/NEON/functions/NEQuantizationLayer.cpp" - ] + ], + "neon":{ + "fp32":["src/cpu/kernels/quantize/generic/neon/fp32.cpp"], + "fp16":["src/cpu/kernels/quantize/generic/neon/fp16.cpp"], + "integer":["src/cpu/kernels/quantize/generic/neon/integer.cpp"] + } } }, "Range": { @@ -2115,7 +2125,14 @@ "common": [ "src/core/NEON/kernels/NEReductionOperationKernel.cpp", "src/runtime/NEON/functions/NEReductionOperation.cpp" - ] + ], + "neon":{ + "fp32":["src/cpu/kernels/reduction_layer/generic/neon/fp32.cpp"], + "fp16":["src/cpu/kernels/reduction_layer/generic/neon/fp16.cpp"], + "integer":["src/cpu/kernels/reduction_layer/generic/neon/integer.cpp"], + "qasymm8":["src/cpu/kernels/reduction_layer/generic/neon/qasymm8.cpp"], + "qasymm8_signed":["src/cpu/kernels/reduction_layer/generic/neon/qasymm8_signed.cpp"] + } } }, "Reorg": { diff --git a/src/BUILD.bazel b/src/BUILD.bazel index 8b8842dfce..f270824ab4 100644 --- a/src/BUILD.bazel +++ b/src/BUILD.bazel @@ -753,6 +753,8 @@ filegroup( "cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp", "cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp", "cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp", + "cpu/kernels/dequantize/generic/neon/fp16.cpp", + "cpu/kernels/dequantize/generic/neon/fp32.cpp", "cpu/kernels/directconv2d/nchw/all.cpp", "cpu/kernels/directconv2d/nchw/fp16.cpp", "cpu/kernels/directconv2d/nhwc/neon/fp16.cpp", @@ -818,9 +820,17 @@ filegroup( "cpu/kernels/pool3d/neon/fp32.cpp", "cpu/kernels/pool3d/neon/qasymm8.cpp", "cpu/kernels/pool3d/neon/qasymm8_signed.cpp", + "cpu/kernels/quantize/generic/neon/fp16.cpp", + "cpu/kernels/quantize/generic/neon/fp32.cpp", + "cpu/kernels/quantize/generic/neon/integer.cpp", "cpu/kernels/range/generic/neon/fp16.cpp", "cpu/kernels/range/generic/neon/fp32.cpp", "cpu/kernels/range/generic/neon/integer.cpp", + "cpu/kernels/reduction_layer/generic/neon/fp16.cpp", + "cpu/kernels/reduction_layer/generic/neon/fp32.cpp", + "cpu/kernels/reduction_layer/generic/neon/integer.cpp", + "cpu/kernels/reduction_layer/generic/neon/qasymm8.cpp", + "cpu/kernels/reduction_layer/generic/neon/qasymm8_signed.cpp", "cpu/kernels/roialign/generic/neon/fp16.cpp", "cpu/kernels/roialign/generic/neon/fp32.cpp", "cpu/kernels/roialign/generic/neon/qasymm8.cpp", diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 74be9e5fac..87c5f8b21d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -744,6 +744,8 @@ target_sources( cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp + cpu/kernels/dequantize/generic/neon/fp16.cpp + cpu/kernels/dequantize/generic/neon/fp32.cpp cpu/kernels/directconv2d/nchw/all.cpp cpu/kernels/directconv2d/nchw/fp16.cpp cpu/kernels/directconv2d/nhwc/neon/fp16.cpp @@ -809,9 +811,17 @@ target_sources( cpu/kernels/pool3d/neon/fp32.cpp cpu/kernels/pool3d/neon/qasymm8.cpp cpu/kernels/pool3d/neon/qasymm8_signed.cpp + cpu/kernels/quantize/generic/neon/fp16.cpp + cpu/kernels/quantize/generic/neon/fp32.cpp + cpu/kernels/quantize/generic/neon/integer.cpp cpu/kernels/range/generic/neon/fp16.cpp cpu/kernels/range/generic/neon/fp32.cpp cpu/kernels/range/generic/neon/integer.cpp + cpu/kernels/reduction_layer/generic/neon/fp16.cpp + cpu/kernels/reduction_layer/generic/neon/fp32.cpp + cpu/kernels/reduction_layer/generic/neon/integer.cpp + cpu/kernels/reduction_layer/generic/neon/qasymm8.cpp + cpu/kernels/reduction_layer/generic/neon/qasymm8_signed.cpp cpu/kernels/roialign/generic/neon/fp16.cpp cpu/kernels/roialign/generic/neon/fp32.cpp cpu/kernels/roialign/generic/neon/qasymm8.cpp diff --git a/src/common/cpuinfo/CpuInfo.cpp b/src/common/cpuinfo/CpuInfo.cpp index 93f51e599a..d46d8d7773 100644 --- a/src/common/cpuinfo/CpuInfo.cpp +++ b/src/common/cpuinfo/CpuInfo.cpp @@ -29,6 +29,7 @@ #include "support/StringSupport.h" #include "support/ToolchainSupport.h" +#include <map> #include <sstream> #if !defined(BARE_METAL) @@ -269,6 +270,46 @@ int get_max_cpus() } return max_cpus; } +#if defined(__ANDROID__) +std::vector<uint32_t> get_cpu_capacities() +{ + std::vector<uint32_t> cpu_capacities; + for (int i = 0; i < get_max_cpus(); ++i) + { + std::stringstream str; + str << "/sys/devices/system/cpu/cpu" << i << "/cpu_capacity"; + std::ifstream file(str.str(), std::ios::in); + if (file.is_open()) + { + std::string line; + if (bool(getline(file, line))) + { + cpu_capacities.emplace_back(support::cpp11::stoul(line)); + } + } + } + + return cpu_capacities; +} + +uint32_t not_little_num_cpus_internal() +{ + std::vector<uint32_t> cpus_all = get_cpu_capacities(); + std::vector<uint32_t> cpus_not_little; + + std::vector<uint32_t>::iterator result = std::max_element(cpus_all.begin(), cpus_all.end()); + uint32_t max_capacity = *result; + uint32_t threshold = max_capacity / 2; + for (unsigned int i = 0; i < cpus_all.size(); i++) + { + if (!(cpus_all[i] < threshold)) + { + cpus_not_little.emplace_back(cpus_all[i]); + } + } + return cpus_not_little.size(); +} +#endif /* defined(__ANDROID__) */ #elif defined(__aarch64__) && \ defined(__APPLE__) /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */ /** Query features through sysctlbyname @@ -363,6 +404,8 @@ CpuInfo CpuInfo::build() isainfo.neon = get_hw_capability("hw.optional.neon"); isainfo.fp16 = get_hw_capability("hw.optional.neon_fp16"); isainfo.dot = get_hw_capability("hw.optional.arm.FEAT_DotProd"); + isainfo.bf16 = get_hw_capability("hw.optional.arm.FEAT_BF16"); + isainfo.i8mm = get_hw_capability("hw.optional.arm.FEAT_I8MM"); CpuInfo info(isainfo, cpus_model); return info; #elif defined(__aarch64__) && defined(_WIN64) /* #elif defined(__aarch64__) && defined(__APPLE__) */ @@ -400,6 +443,15 @@ uint32_t CpuInfo::num_cpus() const return _cpus.size(); } +uint32_t CpuInfo::not_little_num_cpus() const +{ +#if defined(__ANDROID__) + return not_little_num_cpus_internal(); +#else /* defined(__ANDROID__) */ + return num_cpus(); +#endif /* defined(__ANDROID__) */ +} + uint32_t num_threads_hint() { unsigned int num_threads_hint = 1; diff --git a/src/common/cpuinfo/CpuInfo.h b/src/common/cpuinfo/CpuInfo.h index 953e4883c3..78d11e9610 100644 --- a/src/common/cpuinfo/CpuInfo.h +++ b/src/common/cpuinfo/CpuInfo.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_COMMON_CPUINFO_H -#define SRC_COMMON_CPUINFO_H +#ifndef ACL_SRC_COMMON_CPUINFO_CPUINFO_H +#define ACL_SRC_COMMON_CPUINFO_CPUINFO_H #include "src/common/cpuinfo/CpuIsaInfo.h" #include "src/common/cpuinfo/CpuModel.h" @@ -120,6 +120,7 @@ public: CpuModel cpu_model(uint32_t cpuid) const; CpuModel cpu_model() const; uint32_t num_cpus() const; + uint32_t not_little_num_cpus() const; private: CpuIsaInfo _isa{}; @@ -135,4 +136,4 @@ private: uint32_t num_threads_hint(); } // namespace cpuinfo } // namespace arm_compute -#endif /* SRC_COMMON_CPUINFO_H */ +#endif // ACL_SRC_COMMON_CPUINFO_CPUINFO_H diff --git a/src/core/CPP/CPPTypes.cpp b/src/core/CPP/CPPTypes.cpp index f6761f27b0..ee39210fa5 100644 --- a/src/core/CPP/CPPTypes.cpp +++ b/src/core/CPP/CPPTypes.cpp @@ -140,10 +140,20 @@ unsigned int CPUInfo::get_L2_cache_size() const unsigned long CPUInfo::get_sme2_vector_length() const { #ifdef ARM_COMPUTE_ENABLE_SME2 - return arm_gemm::utils::sme::get_vector_length<int8_t>(); + if (this->has_sme2()) + return arm_gemm::utils::sme::get_vector_length<int8_t>(); + else + return 0; #else // ARM_COMPUTE_ENABLE_SME2 return 0; #endif // ARM_COMPUTE_ENABLE_SME2 } - +unsigned int CPUInfo::get_cpu_num_excluding_little() const +{ +#if defined(__ANDROID__) + return _impl->info.not_little_num_cpus(); +#else /* defined(__ANDROID__) */ + return get_cpu_num(); +#endif /* defined(__ANDROID__) */ +} } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp index 717fd11485..153c36052a 100644 --- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, 2023 Arm Limited. + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -78,11 +78,11 @@ static const BatchNormalizationKernel available_kernels[] = { REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization)}, #endif /* !defined(ARM_COMPUTE_ENABLE_SVE) */ #if defined(ARM_COMPUTE_ENABLE_NEON) -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if ARM_COMPUTE_ENABLE_FP16 {"neon_fp16_batch_normalization", [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F16; }, REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_batch_normalization)}, -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +#endif /* ARM_COMPUTE_ENABLE_FP16 */ {"neon_fp32_batch_normalization", [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F32; }, REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization)}, diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp index cb869838e2..694def1a3a 100644 --- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp +++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022 Arm Limited. + * Copyright (c) 2019-2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -63,11 +63,11 @@ static const BoundingBoxTransformKernel available_kernels[] = { {"fp32_neon_boundingboxtransform", [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F32; }, REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform)}, -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifdef ARM_COMPUTE_ENABLE_FP16 {"fp16_neon_boundingboxtransform", [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F16; }, REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_boundingboxtransform)}, -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#endif // ARM_COMPUTE_ENABLE_FP16 #if defined(ARM_COMPUTE_ENABLE_NEON) {"qu16_neon_boundingboxtransform", [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::QASYMM16; }, diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp index 549319e49f..e23e3d020f 100644 --- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp +++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022 Arm Limited. + * Copyright (c) 2019-2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -61,10 +61,10 @@ static const ComputeAllAnchorsKernel available_kernels[] = { {"neon_qu16_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::QSYMM16; }, REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_computeallanchors)}, #endif //defined(ARM_COMPUTE_ENABLE_NEON) -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifdef ARM_COMPUTE_ENABLE_FP16 {"neon_fp16_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::F16; }, REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_computeallanchors)}, -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#endif // ARM_COMPUTE_ENABLE_FP16 {"neon_fp32_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::F32; }, REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_computeallanchors)}, }; diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp index 0a1780f6ee..5883731088 100644 --- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022 Arm Limited. + * Copyright (c) 2019-2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -70,10 +70,10 @@ struct InstanceNormKernel static const InstanceNormKernel available_kernels[] = { {"fp32_neon_instancenorm", [](const InstanceNormSelectorData &data) { return data.dt == DataType::F32; }, REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_instancenorm)}, -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifdef ARM_COMPUTE_ENABLE_FP16 {"fp16_neon_instancenorm", [](const InstanceNormSelectorData &data) { return data.dt == DataType::F16; }, REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_instancenorm)}, -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#endif // ARM_COMPUTE_ENABLE_FP16 }; /** Micro-kernel selector diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp index 451031d696..cfe4ac9a4c 100644 --- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp +++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022 Arm Limited. + * Copyright (c) 2019-2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -60,10 +60,10 @@ struct MeanStdDevNormKernel static const std::vector<MeanStdDevNormKernel> available_kernels = { {"fp32_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::F32; }, REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm)}, -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifdef ARM_COMPUTE_ENABLE_FP16 {"fp16_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::F16; }, REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm)}, -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#endif // ARM_COMPUTE_ENABLE_FP16 {"qasymm8_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::QASYMM8; }, REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_meanstddevnorm)}, }; diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp index 455d604b3b..5380e6ccce 100644 --- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp +++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -31,1747 +31,221 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/INEKernel.h" -#include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "support/SaturateCast.h" - -#include <arm_neon.h> +#include "src/cpu/kernels/reduction_layer/generic/neon/list.h" namespace arm_compute { -namespace -{ -// Helper function that calls vqmovun/vqmvn, vcombine and vstore, allows templating of RedOpYZW_quantized -template <typename T> -void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output, int offset = 0) -{ - if (std::is_same<T, uint8_t>::value) - { - auto res = wrapper::vcombine(wrapper::vqmovun(t1), wrapper::vqmovun(t2)); - wrapper::vstore(output.ptr() + offset, res); - } - else - { - auto res = wrapper::vcombine(wrapper::vqmovn(t1), wrapper::vqmovn(t2)); - wrapper::vstore(reinterpret_cast<int8_t *>(output.ptr() + offset), res); - } -} - -template <typename T> -uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis) -{ - uint32x4_t mask{0}; - if (op == ReductionOperation::ARG_IDX_MIN) - { - mask = wrapper::vcgt(b, a); - } - else - { - mask = wrapper::vclt(b, a); - } - - uint32x4_t vec_idx = {idx, idx + 1, idx + 2, idx + 3}; - if (axis != 0) - { - vec_idx = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); - } - uint32x4x4_t res = {{wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0}}; - - return res; -} - -template <typename T> -uint32x4x4_t calculate_index_quantized(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis) -{ - uint32x4x4_t mask{{0}}; - uint8x16_t mask_u8{0}; - if (op == ReductionOperation::ARG_IDX_MIN) - { - mask_u8 = wrapper::vcgt(b, a); - } - else - { - mask_u8 = wrapper::vclt(b, a); - } - auto wide_u16_1 = - wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8))); - auto wide_u16_2 = - wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8))); - mask.val[0] = - wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1))); - mask.val[1] = - wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1))); - mask.val[2] = - wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2))); - mask.val[3] = - wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2))); - - uint32x4x4_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3}, - {idx + 4, idx + 5, idx + 6, idx + 7}, - {idx + 8, idx + 9, idx + 10, idx + 11}, - {idx + 12, idx + 13, idx + 14, idx + 15}}}; - if (axis != 0) - { - vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); - vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); - vec_idx.val[2] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); - vec_idx.val[3] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); - } - uint32x4x4_t res = { - {vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]), vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]), - vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]), vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])}}; - - return res; -} - -// Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value. -template <typename T> -inline typename std::enable_if< - std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value, - typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type>::type -calculate_min(T in) -{ - auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); - return wrapper::vpmin(pmin, pmin); -} - -// Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value. -template <typename T> -inline typename std::enable_if< - std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value, - typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type>::type -calculate_min(T in) -{ - auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); - pmin = wrapper::vpmin(pmin, pmin); - pmin = wrapper::vpmin(pmin, pmin); - return wrapper::vpmin(pmin, pmin); -} - -// Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value. -template <typename T> -inline typename std::enable_if< - std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value, - typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type>::type -calculate_max(T in) -{ - auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); - return wrapper::vpmax(pmax, pmax); -} - -// Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value. -template <typename T> -inline typename std::enable_if< - std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value, - typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type>::type -calculate_max(T in) -{ - auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); - pmax = wrapper::vpmax(pmax, pmax); - pmax = wrapper::vpmax(pmax, pmax); - return wrapper::vpmax(pmax, pmax); -} - -template <typename T> -uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op) -{ - uint32x4_t res_idx_mask{0}; - uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); - - if (op == ReductionOperation::ARG_IDX_MIN) - { - auto pmin = calculate_min(vec_res_value); - auto mask = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); - res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask); - } - else - { - auto pmax = calculate_max(vec_res_value); - auto mask = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax)); - res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask); - } - - res_idx_mask = wrapper::vadd(res_idx_mask, mask_ones); - auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask), wrapper::vgetlow(res_idx_mask)); - pmin = wrapper::vpmin(pmin, pmin); - uint32_t res = wrapper::vgetlane(pmin, 0); - - return (res - 0xFFFFFFFF); -} - -template <typename T> -uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op) -{ - uint32x4x4_t res_idx_mask{{0}}; - uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); - uint8x16_t mask_u8{0}; - if (op == ReductionOperation::ARG_IDX_MIN) - { - auto pmin = calculate_min(vec_res_value); - mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); - } - else - { - auto pmax = calculate_max(vec_res_value); - mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax)); - } - - // Widen vectors - auto wide_u16_1 = - wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8))); - auto wide_u16_2 = - wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8))); - auto wide_u32_1 = - wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1))); - auto wide_u32_2 = - wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1))); - auto wide_u32_3 = - wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2))); - auto wide_u32_4 = - wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2))); - res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1); - res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2); - res_idx_mask.val[2] = wrapper::vand(vec_res_idx.val[2], wide_u32_3); - res_idx_mask.val[3] = wrapper::vand(vec_res_idx.val[3], wide_u32_4); - res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones); - res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones); - res_idx_mask.val[2] = wrapper::vadd(res_idx_mask.val[2], mask_ones); - res_idx_mask.val[3] = wrapper::vadd(res_idx_mask.val[3], mask_ones); - - uint32_t res = 0xFFFFFFFF; - int iter = 0; - do - { - auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter])); - pmin = wrapper::vpmin(pmin, pmin); - res = std::min(wrapper::vgetlane(pmin, 0), res); - iter++; - } while (iter < 4); - - return (res - 0xFFFFFFFF); -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template <> -uint32x4x4_t -calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis) -{ - uint32x4x2_t mask{0}; - uint16x8_t mask_u16{0}; - if (op == ReductionOperation::ARG_IDX_MIN) - { - mask_u16 = wrapper::vcgt(b, a); - } - else - { - mask_u16 = wrapper::vclt(b, a); - } - mask.val[0] = wrapper::vmovl(wrapper::vgetlow(mask_u16)); - mask.val[1] = wrapper::vmovl(wrapper::vgethigh(mask_u16)); - uint32x4x2_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3}, {idx + 4, idx + 5, idx + 6, idx + 7}}}; - if (axis != 0) - { - vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); - vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); - } - uint32x4x4_t res = {wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]), - wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]), 0, 0}; - - return res; -} - -// Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value. -inline float16x4_t calculate_min(float16x8_t in) -{ - auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); - pmin = wrapper::vpmin(pmin, pmin); - return wrapper::vpmin(pmin, pmin); -} -// Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value. -inline float16x4_t calculate_max(float16x8_t in) -{ - auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); - pmax = wrapper::vpmax(pmax, pmax); - return wrapper::vpmax(pmax, pmax); -} - -template <> -uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_value, ReductionOperation op) -{ - uint32x4x2_t res_idx_mask{0}; - uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); - uint16x8_t mask_u16; - if (op == ReductionOperation::ARG_IDX_MIN) - { - auto pmin = calculate_min(vec_res_value); - mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); - } - else - { - auto pmax = calculate_max(vec_res_value); - mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax)); - } - - // Widen vectors - auto wide_u32_1 = - wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16))); - auto wide_u32_2 = - wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16))); - res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1); - res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2); - res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones); - res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones); - - uint32_t res = 0xFFFFFFFF; - uint32_t iter = 0; - do - { - auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter])); - pmin = wrapper::vpmin(pmin, pmin); - res = std::min(wrapper::vgetlane(pmin, 0), res); - iter++; - } while (iter < 2); - - return (res - 0xFFFFFFFF); -} -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -template <class F> -class Reducer -{ -public: - static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op) - { - // Set out window - Window out_window(window); - out_window.set(Window::DimX, Window::Dimension(0, 1, 1)); - - f(window, out_window, input, output, op); - } - static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op) - { - // Set in window - Window in_window(window); - Window out_window(window); - - in_window.set(Window::DimY, Window::Dimension(0, 1, 1)); - out_window.set(Window::DimY, Window::Dimension(0, output->info()->dimension(1), output->info()->dimension(1))); - - f(in_window, out_window, input, output, 1, op); - } - static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op) - { - // Set in window - Window in_window(window); - Window out_window(window); - - in_window.set(Window::DimZ, Window::Dimension(0, 1, 1)); - out_window.set(Window::DimZ, Window::Dimension(0, output->info()->dimension(2), output->info()->dimension(2))); - - f(in_window, out_window, input, output, 2, op); - } - static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op) - { - // Set in/out window - Window in_window(window); - Window out_window(window); - - in_window.set(3, Window::Dimension(0, 1, 1)); - out_window.set(3, Window::Dimension(0, 1, 1)); - - f(in_window, out_window, input, output, 3, op); - } -}; - -template <typename T, int S> -struct RedOpX -{ - /** SIMD vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; - - inline void operator()( - const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op) - { - const size_t input_dim_0 = in->info()->dimension(0); - const int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast<int>(in_window.x().start()); - const auto window_end_x = static_cast<int>(in_window.x().end()); - - Window in_win_no_pad = in_window; - in_win_no_pad.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(in, in_win_no_pad); - Iterator output(out, out_window); - - execute_window_loop( - in_win_no_pad, - [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<const T *>(input.ptr()); - - auto init_res_value = static_cast<T>(0.f); - switch (op) - { - case ReductionOperation::ARG_IDX_MAX: - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::MIN: - case ReductionOperation::MAX: - { - init_res_value = static_cast<T>(*input_ptr); - break; - } - case ReductionOperation::PROD: - { - init_res_value = static_cast<T>(1.f); - break; - } - default: - break; - } - auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{}); - uint32x4x4_t vec_res_idx{{0}}; - - // Compute window_step_x elements per iteration - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vec_elements = wrapper::vloadq(input_ptr + x); - switch (op) - { - case ReductionOperation::SUM_SQUARE: - vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); - break; - case ReductionOperation::MEAN_SUM: - case ReductionOperation::SUM: - vec_res_value = wrapper::vadd(vec_elements, vec_res_value); - break; - case ReductionOperation::PROD: - vec_res_value = wrapper::vmul(vec_elements, vec_res_value); - break; - case ReductionOperation::ARG_IDX_MIN: - { - auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, - vec_res_idx, op, 0); - vec_res_value = temp_vec_res_value; - break; - } - case ReductionOperation::ARG_IDX_MAX: - { - auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, - vec_res_idx, op, 0); - vec_res_value = temp_vec_res_value; - break; - } - case ReductionOperation::MIN: - { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - break; - } - case ReductionOperation::MAX: - { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - } - - switch (op) - { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - case ReductionOperation::SUM_SQUARE: - { -#ifdef ARM_COMPUTE_DEBUG_ENABLED - auto res = static_cast<T>(0.f); - for (int i = 0; i < S; ++i) - { - res += wrapper::vgetlane(vec_res_value, i); - } -#else // ARM_COMPUTE_DEBUG_ENABLED - auto carry_res = - wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); - for (int i = 0; i < S / 4; ++i) - { - carry_res = wrapper::vpadd(carry_res, carry_res); - } - auto res = wrapper::vgetlane(carry_res, 0); -#endif // ARM_COMPUTE_DEBUG_ENABLED - if (op == ReductionOperation::SUM_SQUARE) - { - // Compute left-over elements - for (; x < window_end_x; ++x) - { - res += (*(input_ptr + x)) * (*(input_ptr + x)); - } - } - else - { - // Compute left-over elements - for (; x < window_end_x; ++x) - { - res += *(input_ptr + x); - } - } - - if (op == ReductionOperation::MEAN_SUM) - { - res /= input_dim_0; - } - - *(reinterpret_cast<T *>(output.ptr())) = res; - break; - } - case ReductionOperation::PROD: - { - auto carry_res = - wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); - T res = 1; - for (int i = 0; i < S / 2; ++i) - { - res *= wrapper::vgetlane(carry_res, i); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - res *= *(input_ptr + x); - } - - *(reinterpret_cast<T *>(output.ptr())) = res; - break; - } - case ReductionOperation::ARG_IDX_MIN: - { - auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op); - auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - if (*(input_ptr + x) < res) - { - idx = x; - res = *(input_ptr + x); - } - } - *(reinterpret_cast<uint32_t *>(output.ptr())) = idx; - break; - } - case ReductionOperation::ARG_IDX_MAX: - { - auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op); - auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - if (*(input_ptr + x) > res) - { - idx = x; - res = *(input_ptr + x); - } - } - *(reinterpret_cast<uint32_t *>(output.ptr())) = idx; - break; - } - case ReductionOperation::MIN: - { - auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - res = *(input_ptr + x) < res ? *(input_ptr + x) : res; - } - *(reinterpret_cast<T *>(output.ptr())) = res; - break; - } - case ReductionOperation::MAX: - { - auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - res = *(input_ptr + x) > res ? *(input_ptr + x) : res; - } - *(reinterpret_cast<T *>(output.ptr())) = res; - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - }, - input, output); - } -}; - -template <typename T> -struct RedOpX_quantized -{ - inline void operator()( - const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op) - { - using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type; - - const auto oq_info = out->info()->quantization_info().uniform(); - - const TensorInfo in_info = *(in->info()); - const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform(); - - const int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast<int>(in_window.x().start()); - const auto window_end_x = static_cast<int>(in_window.x().end()); - - Window in_win_no_pad = in_window; - in_win_no_pad.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(in, in_win_no_pad); - Iterator output(out, out_window); - - const auto in_offset = static_cast<float>(iq_info.offset); - const float in_scale = iq_info.scale; - - const auto out_offset = static_cast<float>(oq_info.offset); - const float out_scale = oq_info.scale; - - const auto num_elements = static_cast<float>(in_info.dimension(0)); - - const float A = in_scale / (out_scale * num_elements); - const float B = out_offset - (in_scale * in_offset) / (out_scale); - - execute_window_loop( - in_win_no_pad, - [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<T *>(input.ptr()); - - auto vec_res_value1 = - wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{}); - auto vec_res_value2 = - wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{}); - auto vec_res_value3 = - wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{}); - auto vec_res_value4 = - wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{}); - - auto vec_res_value1_f = vdupq_n_f32(static_cast<float>(1.f)); - auto vec_res_value2_f = vdupq_n_f32(static_cast<float>(1.f)); - auto vec_res_value3_f = vdupq_n_f32(static_cast<float>(1.f)); - auto vec_res_value4_f = vdupq_n_f32(static_cast<float>(1.f)); - - typename wrapper::traits::neon_vector<T, 16>::type vec_res_value = {0}; - - if (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || - op == ReductionOperation::MIN || op == ReductionOperation::MAX) - { - vec_res_value = wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{}); - } - - uint32x4x4_t vec_res_idx{{0}}; - // Compute window_step_x elements per iteration - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vec_elements = wrapper::vloadq(input_ptr + x); - switch (op) - { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - { - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); - vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); - vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); - vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); - break; - } - case ReductionOperation::PROD: - { - const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset); - const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale); - - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1); - auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2); - auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3); - auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4); - - //de-quantize vec_elements - temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4); - temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4); - temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4); - temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4); - - vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f); - vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f); - vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f); - vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f); - break; - } - case ReductionOperation::ARG_IDX_MIN: - { - auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>( - x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); - vec_res_value = temp_vec_res_value; - break; - } - case ReductionOperation::ARG_IDX_MAX: - { - auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>( - x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); - vec_res_value = temp_vec_res_value; - break; - } - case ReductionOperation::MIN: - { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - break; - } - case ReductionOperation::MAX: - { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - } - - switch (op) - { - case ReductionOperation::ARG_IDX_MIN: - { - auto idx = - calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op); - auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - if (*(input_ptr + x) < res) - { - idx = x; - res = *(input_ptr + x); - } - } - *(reinterpret_cast<uint32_t *>(output.ptr())) = idx; - break; - } - case ReductionOperation::ARG_IDX_MAX: - { - auto idx = - calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op); - auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - if (*(input_ptr + x) > res) - { - idx = x; - res = *(input_ptr + x); - } - } - *(reinterpret_cast<uint32_t *>(output.ptr())) = idx; - break; - } - case ReductionOperation::MIN: - { - auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - res = *(input_ptr + x) < res ? *(input_ptr + x) : res; - } - *(reinterpret_cast<T *>(output.ptr())) = res; - break; - } - case ReductionOperation::MAX: - { - auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - res = *(input_ptr + x) > res ? *(input_ptr + x) : res; - } - *(reinterpret_cast<T *>(output.ptr())) = res; - break; - } - case ReductionOperation::PROD: - { - auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f); - carry_res = wrapper::vmul(carry_res, vec_res_value3_f); - carry_res = wrapper::vmul(carry_res, vec_res_value4_f); - - float res = wrapper::vgetlane(carry_res, 0); - res *= wrapper::vgetlane(carry_res, 1); - res *= wrapper::vgetlane(carry_res, 2); - res *= wrapper::vgetlane(carry_res, 3); - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - //de-quantize input - if (std::is_same<T, uint8_t>::value) - { - res *= dequantize_qasymm8(*(input_ptr + x), iq_info); - } - else - { - res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info); - } - } - - //re-quantize result - if (std::is_same<T, uint8_t>::value) - { - res = quantize_qasymm8(res, iq_info); - } - else - { - res = quantize_qasymm8_signed(res, iq_info); - } - - *reinterpret_cast<T *>(output.ptr()) = static_cast<T>(res); - break; - } - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - { - auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2); - carry_res = wrapper::vadd(carry_res, vec_res_value3); - carry_res = wrapper::vadd(carry_res, vec_res_value4); - - auto carry_paddition = - wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res)); - carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition); - auto res = static_cast<int32_t>(wrapper::vgetlane(carry_paddition, 0)); - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - res += *(input_ptr + x); - } - - if (op == ReductionOperation::MEAN_SUM) - { - const int32_t resFinal = A * (static_cast<float>(res)) + B; - - *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(resFinal); - } - else - { - // Subtract accumulated offsets - res -= (in_info.dimension(0) - 1) * iq_info.offset; - *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(res); - } - - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - }, - input, output); - } -}; - -template <typename T, int S> -struct RedOpYZW -{ - /** SIMD vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; - using neon_vector = typename wrapper::traits::neon_vector<T, S>::type; - - inline void operator()(const Window &in_window, - Window &out_window, - const ITensor *in, - ITensor *out, - int axis, - const ReductionOperation op) - { - const TensorInfo in_info = *(in->info()); - const int window_step_x = 16 / sizeof(T); - const auto window_start_x_tmp = static_cast<int>(in_window.x().start()); - const auto window_end_x_tmp = static_cast<int>(in_window.x().end()); - // As it split over x-axis, need to set the correct spiltted window start and end. - const auto window_start_x = static_cast<int>(0); - const auto window_end_x = static_cast<int>(in_window.shape().x()); - - Window in_win_no_pad = in_window; - in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x())); - Window out_win_no_pad = out_window; - out_win_no_pad.set(Window::DimX, - Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); - - Iterator input(in, in_win_no_pad); - Iterator output(out, out_win_no_pad); - - execute_window_loop( - in_win_no_pad, - [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<T *>(input.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - neon_vector vec_res_value = {0}; - switch (op) - { - case ReductionOperation::ARG_IDX_MAX: - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::MIN: - case ReductionOperation::MAX: - { - vec_res_value = wrapper::vloadq(input_ptr + x); - break; - } - case ReductionOperation::PROD: - { - vec_res_value = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{}); - break; - } - default: - { - vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); - break; - } - } - uint32x4x4_t vec_res_idx{{0}}; - - for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) - { - const T *in_ptr = - reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim); - const auto vec_elements = wrapper::vloadq(in_ptr); - switch (op) - { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - vec_res_value = wrapper::vadd(vec_elements, vec_res_value); - break; - case ReductionOperation::SUM_SQUARE: - vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); - break; - case ReductionOperation::PROD: - vec_res_value = wrapper::vmul(vec_elements, vec_res_value); - break; - case ReductionOperation::ARG_IDX_MIN: - { - auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - vec_res_idx = - calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); - vec_res_value = temp_vec_res_value; - break; - } - case ReductionOperation::ARG_IDX_MAX: - { - auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - vec_res_idx = - calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); - vec_res_value = temp_vec_res_value; - break; - } - case ReductionOperation::MIN: - { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - break; - } - case ReductionOperation::MAX: - { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - } - - if (op == ReductionOperation::MEAN_SUM) - { - auto vec_width_inv = - wrapper::vinv(wrapper::vdup_n(static_cast<T>(in_info.dimension(axis)), ExactTagType{})); - vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv); - } - - if (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) - { - wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x, vec_res_idx.val[0]); -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - if (std::is_same<T, float16_t>::value) - { - wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x + 4, vec_res_idx.val[1]); - } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - } - else - { - wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x * sizeof(T)), vec_res_value); - } - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - auto res_value = 0.f; - switch (op) - { - case ReductionOperation::ARG_IDX_MAX: - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::MIN: - case ReductionOperation::MAX: - { - res_value = *(input_ptr + x); - break; - } - case ReductionOperation::PROD: - { - res_value = static_cast<T>(1.f); - break; - } - default: - { - res_value = static_cast<T>(0.f); - break; - } - } - - uint32_t res_idx = 0; - for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) - { - const T *in_ptr = - reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim); - - switch (op) - { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - res_value += *in_ptr; - break; - case ReductionOperation::SUM_SQUARE: - res_value += *in_ptr * *in_ptr; - break; - case ReductionOperation::PROD: - res_value *= *in_ptr; - break; - case ReductionOperation::ARG_IDX_MIN: - { - if (*in_ptr < res_value) - { - res_value = *in_ptr; - res_idx = dim; - } - break; - } - case ReductionOperation::ARG_IDX_MAX: - { - if (*in_ptr > res_value) - { - res_value = *in_ptr; - res_idx = dim; - } - break; - } - case ReductionOperation::MIN: - { - res_value = *in_ptr < res_value ? *in_ptr : res_value; - break; - } - case ReductionOperation::MAX: - { - res_value = *in_ptr > res_value ? *in_ptr : res_value; - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - } - - if (op == ReductionOperation::MEAN_SUM) - { - res_value /= in_info.dimension(axis); - } - - if (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) - { - *(reinterpret_cast<uint32_t *>(output.ptr()) + x) = res_idx; - } - else - { - *(reinterpret_cast<T *>(output.ptr() + x * sizeof(T))) = res_value; - } - } - }, - input, output); - } -}; - -template <typename T, int S, int axis, ReductionOperation op> -struct RedOpYZW_complex -{ - /** SIMD vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; - using neon_vector = typename wrapper::traits::neon_vector<T, S>::type; - - inline void operator()( - const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int, const ReductionOperation) - { - ARM_COMPUTE_ERROR_ON(axis != 2); - ARM_COMPUTE_ERROR_ON(op != ReductionOperation::SUM); - - const TensorInfo in_info = *(in->info()); - const size_t stride_z = in_info.strides_in_bytes()[axis]; - const int window_step_x = 16 / sizeof(T); - const auto window_start_x_tmp = static_cast<int>(in_window.x().start()); - const auto window_end_x_tmp = static_cast<int>(in_window.x().end()); - // As it split over x-axis, need to set the correct spiltted window start and end. - const auto window_start_x = static_cast<int>(0); - const auto window_end_x = static_cast<int>(in_window.shape().x()); - - Window in_win_no_pad = in_window; - in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x())); - Window out_win_no_pad = out_window; - out_win_no_pad.set(Window::DimX, - Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); - - Iterator input(in, in_win_no_pad); - Iterator output(out, out_win_no_pad); - - execute_window_loop( - in_win_no_pad, - [&](const Coordinates &) - { - // Compute window_step_x elements per iteration - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - neon_vector vec_res_value_0 = {0}; - neon_vector vec_res_value_1 = {0}; - - vec_res_value_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); - vec_res_value_1 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); - - T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T)); - for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) - { - T *in_ptr_0 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim); - T *in_ptr_1 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim); - - const auto vec_elements_0 = wrapper::vloadq(in_ptr_0); - const auto vec_elements_1 = wrapper::vloadq(in_ptr_1); - - vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0); - vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1); - } - - wrapper::vstore(out_ptr, vec_res_value_0); - wrapper::vstore(out_ptr + 4, vec_res_value_1); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - auto res_value_0 = 0.f; - auto res_value_1 = 0.f; - - T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T)); - for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) - { - T *in_ptr = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim); - res_value_0 += *in_ptr; - res_value_1 += *(in_ptr + 1); - } - *out_ptr = res_value_0; - *(out_ptr + 1) = res_value_1; - } - }, - input, output); - } -}; - -template <typename T> -struct RedOpYZW_quantized -{ - inline void operator()(const Window &in_window, - Window &out_window, - const ITensor *in, - ITensor *out, - int axis, - const ReductionOperation op) - { - const TensorInfo in_info = *(in->info()); - const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform(); - using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type; - - const auto oq_info = out->info()->quantization_info().uniform(); - - const int window_step_x = 16 / sizeof(T); - const auto window_start_x_tmp = static_cast<int>(in_window.x().start()); - const auto window_end_x_tmp = static_cast<int>(in_window.x().end()); - // As it split over x-axis, need to set the correct spiltted window start and end. - const auto window_start_x = static_cast<int>(0); - const auto window_end_x = static_cast<int>(in_window.shape().x()); - - Window in_win_no_pad = in_window; - in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x())); - Window out_win_no_pad = out_window; - out_win_no_pad.set(Window::DimX, - Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); - - Iterator input(in, in_win_no_pad); - Iterator output(out, out_win_no_pad); - - using vector_type = - typename wrapper::traits::neon_bitvector<PromotedType, wrapper::traits::BitWidth::W128>::type; - using vector_type_f = typename wrapper::traits::neon_vector<float, 4>::type; - - vector_type vec_res_value1{}; - vector_type vec_res_value2{}; - vector_type vec_res_value3{}; - vector_type vec_res_value4{}; - - vector_type_f vec_res_value1_f{}; - vector_type_f vec_res_value2_f{}; - vector_type_f vec_res_value3_f{}; - vector_type_f vec_res_value4_f{}; - - const float in_offset = static_cast<float>(iq_info.offset); - const float in_scale = iq_info.scale; - - const float out_offset = static_cast<float>(oq_info.offset); - const float out_scale = oq_info.scale; - - const float num_elements = static_cast<float>(in_info.dimension(axis)); - - const float A = in_scale / (out_scale * num_elements); - const float B = out_offset - (in_scale * in_offset) / (out_scale); - - const auto vec_A = wrapper::vdup_n(static_cast<float>(A), wrapper::traits::vector_128_tag{}); - const auto vec_B = wrapper::vdup_n(static_cast<float>(B), wrapper::traits::vector_128_tag{}); - - execute_window_loop( - in_win_no_pad, - [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<T *>(input.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - uint32x4x4_t vec_res_idx{{0}}; - vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{}); - vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{}); - vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{}); - vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{}); - - vec_res_value1_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{}); - vec_res_value2_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{}); - vec_res_value3_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{}); - vec_res_value4_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{}); - - auto vec_res_value = wrapper::vloadq(input_ptr + x); - - for (unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim) - { - const T *in_ptr = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim; - const auto vec_elements = wrapper::vloadq(in_ptr); - switch (op) - { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - { - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); - vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); - vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); - vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); - break; - } - case ReductionOperation::PROD: - { - const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset), - wrapper::traits::vector_128_tag{}); - const auto scale32x4f_4 = - wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{}); - - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1); - auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2); - auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3); - auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4); - - //de-quantize vec_elements - temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), scale32x4f_4); - temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4); - temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4); - temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4); - - vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f); - vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f); - vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f); - vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f); - break; - } - case ReductionOperation::ARG_IDX_MIN: - { - auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, - vec_res_idx, op, axis); - vec_res_value = temp_vec_res_value; - break; - } - case ReductionOperation::ARG_IDX_MAX: - { - auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, - vec_res_idx, op, axis); - vec_res_value = temp_vec_res_value; - break; - } - case ReductionOperation::MIN: - { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - break; - } - case ReductionOperation::MAX: - { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - } - - switch (op) - { - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::ARG_IDX_MAX: - { - wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x), vec_res_idx.val[0]); - wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]); - wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]); - wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 12, - vec_res_idx.val[3]); - break; - } - case ReductionOperation::MIN: - case ReductionOperation::MAX: - { - wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), vec_res_value); - break; - } - case ReductionOperation::SUM: - { - // Subtract offsets - auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset); - - auto vec_res_s_value1 = wrapper::vreinterpret(vec_res_value1); - auto vec_res_s_value2 = wrapper::vreinterpret(vec_res_value2); - auto vec_res_s_value3 = wrapper::vreinterpret(vec_res_value3); - auto vec_res_s_value4 = wrapper::vreinterpret(vec_res_value4); - vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets); - vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, offsets); - vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets); - vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets); - - const auto temp16x8t_1 = - wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2)); - const auto temp16x8t_2 = - wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4)); - - combine_and_store<T>(temp16x8t_1, temp16x8t_2, output, x); - break; - } - case ReductionOperation::MEAN_SUM: - { - vec_res_value1_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value1), vec_A); - vec_res_value2_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value2), vec_A); - vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value3), vec_A); - vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value4), vec_A); - -#ifdef __aarch64__ - vec_res_value1 = wrapper::vcvta<PromotedType>(vec_res_value1_f); - vec_res_value2 = wrapper::vcvta<PromotedType>(vec_res_value2_f); - vec_res_value3 = wrapper::vcvta<PromotedType>(vec_res_value3_f); - vec_res_value4 = wrapper::vcvta<PromotedType>(vec_res_value4_f); -#else // defined(__aarch64__) - vec_res_value1 = wrapper::vcvt<PromotedType>(vec_res_value1_f); - vec_res_value2 = wrapper::vcvt<PromotedType>(vec_res_value2_f); - vec_res_value3 = wrapper::vcvt<PromotedType>(vec_res_value3_f); - vec_res_value4 = wrapper::vcvt<PromotedType>(vec_res_value4_f); -#endif // __aarch64__ - - const auto temp16x8t_1 = - wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2)); - const auto temp16x8t_2 = - wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4)); - auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); - - wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res); - break; - } - case ReductionOperation::PROD: - { - const auto offset32x4f_4 = - wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{}); - const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale)); - - //re-quantize - vec_res_value1_f = - wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4); - vec_res_value2_f = - wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4); - vec_res_value3_f = - wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4); - vec_res_value4_f = - wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4); - - vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f); - vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f); - vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f); - vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f); - - const auto temp16x8t_1 = - wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2)); - const auto temp16x8t_2 = - wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4)); - auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); - - wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res); - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - float res_value = 0.f; - int32_t res_value_q = 0; - - switch (op) - { - case ReductionOperation::ARG_IDX_MAX: - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::MIN: - case ReductionOperation::MAX: - { - res_value = *(input_ptr + x); - break; - } - case ReductionOperation::PROD: - { - res_value = static_cast<T>(1.0f); - break; - } - default: - { - res_value = static_cast<T>(0.0f); - break; - } - } - uint32_t res_idx = 0; - - for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) - { - const T *in_ptr = - reinterpret_cast<T *>(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim); - switch (op) - { - case ReductionOperation::SUM: - { - res_value += *in_ptr; - break; - } - case ReductionOperation::MEAN_SUM: - { - res_value_q += *in_ptr; - break; - } - case ReductionOperation::SUM_SQUARE: - { - res_value += *in_ptr * *in_ptr; - break; - } - case ReductionOperation::PROD: - { - //de-quantize input - if (std::is_same<T, uint8_t>::value) - { - res_value *= dequantize_qasymm8(*in_ptr, iq_info); - } - else - { - res_value *= dequantize_qasymm8_signed(*in_ptr, iq_info); - } - break; - } - case ReductionOperation::ARG_IDX_MIN: - { - if (*in_ptr < res_value) - { - res_value = *in_ptr; - res_idx = dim; - } - break; - } - case ReductionOperation::ARG_IDX_MAX: - { - if (*in_ptr > res_value) - { - res_value = *in_ptr; - res_idx = dim; - } - break; - } - case ReductionOperation::MIN: - { - res_value = *in_ptr < res_value ? *in_ptr : res_value; - break; - } - case ReductionOperation::MAX: - { - res_value = *in_ptr > res_value ? *in_ptr : res_value; - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - } - - switch (op) - { - case ReductionOperation::MEAN_SUM: - { - // Apply previously calculated coefficients (with rounding on aarch64) -#ifdef __aarch64__ - const int32_t res = - arm_compute::support::cpp11::round(A * (static_cast<float>(res_value_q)) + B); -#else // defined(__aarch64__) - const int32_t res = A * (static_cast<float>(res_value_q)) + B; -#endif // __aarch64__ - *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res); - break; - } - case ReductionOperation::SUM: - { - // Subtract accumulated offsets - res_value -= (in_info.dimension(axis) - 1) * iq_info.offset; - *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res_value); - break; - } - case ReductionOperation::PROD: - { - //re-quantize result - T res = 0; - if (std::is_same<T, uint8_t>::value) - { - res = quantize_qasymm8(res_value, iq_info); - } - else - { - res = quantize_qasymm8_signed(res_value, iq_info); - } - *(reinterpret_cast<T *>(output.ptr() + x)) = res; - break; - } - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::ARG_IDX_MAX: - { - *(reinterpret_cast<uint32_t *>(output.ptr() + x * 4)) = res_idx; - break; - } - default: - *(reinterpret_cast<T *>(output.ptr() + x)) = res_value; - } - } - }, - input, output); - } -}; - -void reduce_op( - const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op) +void NEReductionOperationKernel::reduce_op() { - const bool is_complex = (input->info()->num_channels() == 2); + const bool is_complex = (_input->info()->num_channels() == 2); if (is_complex) { - switch (axis) + switch (_reduction_axis) { case 2: - switch (input->info()->data_type()) + switch (_input->info()->data_type()) { case DataType::F32: - switch (op) + { + switch (_op) { case ReductionOperation::SUM: - return Reducer<RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>>::reduceZ( - window, input, output, RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>(), - op); + _func = REGISTER_FP32_NEON(cpu::reduce_RedOpYZW_complex_reduceZ_float32_4_2_SUM); + break; default: ARM_COMPUTE_ERROR("Not supported"); + break; } + break; + } default: + { ARM_COMPUTE_ERROR("Not supported"); + break; + } } + break; default: + { ARM_COMPUTE_ERROR("Not supported"); + break; + } } return; } - switch (axis) + switch (_reduction_axis) { case 0: { - switch (input->info()->data_type()) + switch (_input->info()->data_type()) { case DataType::QASYMM8: { - return Reducer<RedOpX_quantized<uint8_t>>::reduceX(window, input, output, - RedOpX_quantized<uint8_t>(), op); + _func = REGISTER_QASYMM8_NEON(cpu::reduce_RedOpX_reduceX_qasymm8); + break; } case DataType::QASYMM8_SIGNED: { - return Reducer<RedOpX_quantized<int8_t>>::reduceX(window, input, output, RedOpX_quantized<int8_t>(), - op); + _func = REGISTER_QASYMM8_SIGNED_NEON(cpu::reduce_RedOpX_reduceX_qasymm8_signed); + break; } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifdef ARM_COMPUTE_ENABLE_FP16 case DataType::F16: - return Reducer<RedOpX<float16_t, 8>>::reduceX(window, input, output, RedOpX<float16_t, 8>(), op); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + { + _func = REGISTER_FP16_NEON(cpu::reduce_RedOpX_reduceX_float16_8); + break; + } +#endif // ARM_COMPUTE_ENABLE_FP16 case DataType::F32: { - return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op); + _func = REGISTER_FP32_NEON(cpu::reduce_RedOpX_reduceX_float32_4); + break; } case DataType::S32: { - return Reducer<RedOpX<int32_t, 4>>::reduceX(window, input, output, RedOpX<int32_t, 4>(), op); + _func = REGISTER_INTEGER_NEON(cpu::reduce_RedOpX_reduceX_S32_4); + break; } default: { ARM_COMPUTE_ERROR("Not supported"); + break; } } + break; } case 1: - switch (input->info()->data_type()) + { + switch (_input->info()->data_type()) { case DataType::QASYMM8: { - return Reducer<RedOpYZW_quantized<uint8_t>>::reduceY(window, input, output, - RedOpYZW_quantized<uint8_t>(), op); + _func = REGISTER_QASYMM8_NEON(cpu::reduce_RedOpYZW_reduceY_qasymm8); + break; } case DataType::QASYMM8_SIGNED: { - return Reducer<RedOpYZW_quantized<int8_t>>::reduceY(window, input, output, - RedOpYZW_quantized<int8_t>(), op); + _func = REGISTER_QASYMM8_SIGNED_NEON(cpu::reduce_RedOpYZW_reduceY_qasymm8_signed); + break; } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifdef ARM_COMPUTE_ENABLE_FP16 case DataType::F16: - return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(), - op); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + { + _func = REGISTER_FP16_NEON(cpu::reduce_RedOpYZW_reduceY_float16_8); + break; + } +#endif // ARM_COMPUTE_ENABLE_FP16 case DataType::F32: - return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(), op); + { + _func = REGISTER_FP32_NEON(cpu::reduce_RedOpYZW_reduceY_float32_4); + break; + } case DataType::S32: - return Reducer<RedOpYZW<int32_t, 4>>::reduceY(window, input, output, RedOpYZW<int32_t, 4>(), op); + { + _func = REGISTER_INTEGER_NEON(cpu::reduce_RedOpYZW_reduceY_S32_4); + break; + } default: + { ARM_COMPUTE_ERROR("Not supported"); + break; + } } + break; + } case 2: - switch (input->info()->data_type()) + { + switch (_input->info()->data_type()) { case DataType::QASYMM8: - return Reducer<RedOpYZW_quantized<uint8_t>>::reduceZ(window, input, output, - RedOpYZW_quantized<uint8_t>(), op); + { + _func = REGISTER_QASYMM8_NEON(cpu::reduce_RedOpYZW_reduceZ_qasymm8); + break; + } case DataType::QASYMM8_SIGNED: - return Reducer<RedOpYZW_quantized<int8_t>>::reduceZ(window, input, output, - RedOpYZW_quantized<int8_t>(), op); -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + { + _func = REGISTER_QASYMM8_SIGNED_NEON(cpu::reduce_RedOpYZW_reduceZ_qasymm8_signed); + break; + } +#ifdef ARM_COMPUTE_ENABLE_FP16 case DataType::F16: - return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8>(), - op); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + { + _func = REGISTER_FP16_NEON(cpu::reduce_RedOpYZW_reduceZ_float16_8); + break; + } +#endif // ARM_COMPUTE_ENABLE_FP16 case DataType::F32: - return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(), op); + { + _func = REGISTER_FP32_NEON(cpu::reduce_RedOpYZW_reduceZ_float32_4); + break; + } case DataType::S32: - return Reducer<RedOpYZW<int32_t, 4>>::reduceZ(window, input, output, RedOpYZW<int32_t, 4>(), op); + { + _func = REGISTER_INTEGER_NEON(cpu::reduce_RedOpYZW_reduceZ_S32_4); + break; + } default: + { + std::cout << int(_input->info()->data_type()) << std::endl; ARM_COMPUTE_ERROR("Not supported"); + break; + } } + break; + } case 3: - switch (input->info()->data_type()) + { + switch (_input->info()->data_type()) { case DataType::QASYMM8: - return Reducer<RedOpYZW_quantized<uint8_t>>::reduceW(window, input, output, - RedOpYZW_quantized<uint8_t>(), op); + { + _func = REGISTER_QASYMM8_NEON(cpu::reduce_RedOpYZW_reduceW_qasymm8); + break; + } case DataType::QASYMM8_SIGNED: - return Reducer<RedOpYZW_quantized<int8_t>>::reduceW(window, input, output, - RedOpYZW_quantized<int8_t>(), op); -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + { + _func = REGISTER_QASYMM8_SIGNED_NEON(cpu::reduce_RedOpYZW_reduceW_qasymm8_signed); + break; + } +#ifdef ARM_COMPUTE_ENABLE_FP16 case DataType::F16: - return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, RedOpYZW<float16_t, 8>(), - op); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + { + _func = REGISTER_FP16_NEON(cpu::reduce_RedOpYZW_reduceW_float16_8); + break; + } +#endif // ARM_COMPUTE_ENABLE_FP16 case DataType::F32: - return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(), op); + { + _func = REGISTER_FP32_NEON(cpu::reduce_RedOpYZW_reduceW_float32_4); + break; + } case DataType::S32: - return Reducer<RedOpYZW<int32_t, 4>>::reduceW(window, input, output, RedOpYZW<int32_t, 4>(), op); + { + _func = REGISTER_INTEGER_NEON(cpu::reduce_RedOpYZW_reduceW_S32_4); + break; + } default: + { ARM_COMPUTE_ERROR("Not supported"); + break; + } } + break; + } default: + { ARM_COMPUTE_ERROR("Unsupported reduction axis"); + break; + } } } @@ -1819,10 +293,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u return Status{}; } -} // namespace NEReductionOperationKernel::NEReductionOperationKernel() - : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::SUM_SQUARE) + : _func(nullptr), _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::SUM_SQUARE) { } @@ -1856,6 +329,8 @@ void NEReductionOperationKernel::configure(const ITensor *input, .set_data_type(output_data_type) .reset_padding() .set_is_resizable(true)); + // Determine the reduction function + NEReductionOperationKernel::reduce_op(); } Status NEReductionOperationKernel::validate(const ITensorInfo *input, @@ -1874,6 +349,6 @@ void NEReductionOperationKernel::run(const Window &window, const ThreadInfo &inf ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - reduce_op(window, _input, _output, _reduction_axis, _op); + (*_func)(window, _input, _output, _op); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.h b/src/core/NEON/kernels/NEReductionOperationKernel.h index 78bec62c14..407e5de6d6 100644 --- a/src/core/NEON/kernels/NEReductionOperationKernel.h +++ b/src/core/NEON/kernels/NEReductionOperationKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NEREDUCTIONOPERATIONKERNEL_H -#define ARM_COMPUTE_NEREDUCTIONOPERATIONKERNEL_H +#ifndef ACL_SRC_CORE_NEON_KERNELS_NEREDUCTIONOPERATIONKERNEL_H +#define ACL_SRC_CORE_NEON_KERNELS_NEREDUCTIONOPERATIONKERNEL_H #include "src/core/NEON/INEKernel.h" @@ -80,14 +80,24 @@ public: static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op); +private: // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; + /** Common signature for all the specialized Reduction functions + * + * @param[in] window Region on which to execute the kernel. + */ + using ReductionFunction = void (*)(const Window &window, const ITensor *in, ITensor *out, ReductionOperation op); -private: + /** Populate the _func with the right reduction operation handler + */ + void reduce_op(); + + ReductionFunction _func; const ITensor *_input; ITensor *_output; unsigned int _reduction_axis; ReductionOperation _op; }; } // namespace arm_compute -#endif /*ARM_COMPUTE_NEREDUCTIONOPERATIONKERNEL_H */ +#endif // ACL_SRC_CORE_NEON_KERNELS_NEREDUCTIONOPERATIONKERNEL_H diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp index 59591935cd..7c09608e3e 100644 --- a/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp +++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022 Arm Limited. + * Copyright (c) 2020-2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -330,11 +330,11 @@ template void Interleave<8, 2, VLType::None>(float *, const float *, size_t, uns #endif // ARM_COMPUTE_ENABLE_SVE && ARM_COMPUTE_ENABLE_SVEF32MM /* FP16 */ -#if defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(FP16_KERNELS) || defined(ARM_COMPUTE_ENABLE_FP16) template void IndirectInterleave<8, 1, VLType::None>(__fp16 *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); template void ConvolutionInterleave<8, 1, VLType::None>(__fp16 *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); template void Interleave<8, 1, VLType::None>(__fp16 *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); -#endif // FP16_KERNELS ar __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#endif // FP16_KERNELS ar ARM_COMPUTE_ENABLE_FP16 template void IndirectInterleave<8, 1, VLType::None>(float *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); template void ConvolutionInterleave<8, 1, VLType::None>(float *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp index 586d6a64a4..d9668aae02 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,7 +23,7 @@ */ #pragma once -#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)) +#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(ARM_COMPUTE_ENABLE_FP16)) #include "../performance_parameters.hpp" #include "../std_transforms_fixed.hpp" @@ -89,4 +89,4 @@ public: } // namespace arm_gemm -#endif // __aarch64__ && (FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#endif // __aarch64__ && (FP16_KERNELS || ARM_COMPUTE_ENABLE_FP16) diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp index a81d4504ae..ba47e0aa54 100644 --- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2020, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,7 +23,7 @@ */ #pragma once -#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)) +#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(ARM_COMPUTE_ENABLE_FP16)) template<> void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const __fp16 *bias, Activation act, bool append) @@ -86,7 +86,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( -#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifndef ARM_COMPUTE_ENABLE_FP16 ".arch armv8.2-a+fp16\n" #endif "dup v0.8h, %[maxval].h[0]\n" @@ -140,7 +140,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( -#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifndef ARM_COMPUTE_ENABLE_FP16 ".arch armv8.2-a+fp16\n" #endif "dup v0.8h, %[maxval].h[0]\n" @@ -217,7 +217,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( -#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifndef ARM_COMPUTE_ENABLE_FP16 ".arch armv8.2-a+fp16\n" #endif "dup v0.8h, %[maxval].h[0]\n" @@ -317,7 +317,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( -#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifndef ARM_COMPUTE_ENABLE_FP16 ".arch armv8.2-a+fp16\n" #endif "dup v0.8h, %[maxval].h[0]\n" @@ -439,7 +439,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( -#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifndef ARM_COMPUTE_ENABLE_FP16 ".arch armv8.2-a+fp16\n" #endif "dup v0.8h, %[maxval].h[0]\n" @@ -584,7 +584,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( -#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifndef ARM_COMPUTE_ENABLE_FP16 ".arch armv8.2-a+fp16\n" #endif "dup v0.8h, %[maxval].h[0]\n" @@ -752,7 +752,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( -#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifndef ARM_COMPUTE_ENABLE_FP16 ".arch armv8.2-a+fp16\n" #endif "dup v0.8h, %[maxval].h[0]\n" @@ -944,7 +944,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( -#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifndef ARM_COMPUTE_ENABLE_FP16 ".arch armv8.2-a+fp16\n" #endif "dup v0.8h, %[maxval].h[0]\n" @@ -1150,7 +1150,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( -#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifndef ARM_COMPUTE_ENABLE_FP16 ".arch armv8.2-a+fp16\n" #endif "dup v0.8h, %[maxval].h[0]\n" @@ -1204,7 +1204,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( -#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifndef ARM_COMPUTE_ENABLE_FP16 ".arch armv8.2-a+fp16\n" #endif "dup v0.8h, %[maxval].h[0]\n" @@ -1278,7 +1278,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( -#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifndef ARM_COMPUTE_ENABLE_FP16 ".arch armv8.2-a+fp16\n" #endif "dup v0.8h, %[maxval].h[0]\n" @@ -1372,7 +1372,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( -#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifndef ARM_COMPUTE_ENABLE_FP16 ".arch armv8.2-a+fp16\n" #endif "dup v0.8h, %[maxval].h[0]\n" @@ -1485,7 +1485,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( -#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifndef ARM_COMPUTE_ENABLE_FP16 ".arch armv8.2-a+fp16\n" #endif "dup v0.8h, %[maxval].h[0]\n" @@ -1618,7 +1618,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( -#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifndef ARM_COMPUTE_ENABLE_FP16 ".arch armv8.2-a+fp16\n" #endif "dup v0.8h, %[maxval].h[0]\n" @@ -1771,7 +1771,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( -#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifndef ARM_COMPUTE_ENABLE_FP16 ".arch armv8.2-a+fp16\n" #endif "dup v0.8h, %[maxval].h[0]\n" @@ -1945,7 +1945,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( -#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifndef ARM_COMPUTE_ENABLE_FP16 ".arch armv8.2-a+fp16\n" #endif "dup v0.8h, %[maxval].h[0]\n" @@ -2112,4 +2112,4 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } } -#endif // __aarch64__ && (FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#endif // __aarch64__ && (FP16_KERNELS || ARM_COMPUTE_ENABLE_FP16) diff --git a/src/core/NEON/kernels/arm_gemm/transform.cpp b/src/core/NEON/kernels/arm_gemm/transform.cpp index 45e4f0e1de..06d9e2416c 100644 --- a/src/core/NEON/kernels/arm_gemm/transform.cpp +++ b/src/core/NEON/kernels/arm_gemm/transform.cpp @@ -129,17 +129,17 @@ void Transform( // We don't have assembler transforms for AArch32, generate templated ones here. #ifdef __arm__ template void Transform<8, 1, true, VLType::None>(float *, const float *, int, int, int, int, int); -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(ARM_COMPUTE_ENABLE_FP16) template void Transform<8, 1, true, VLType::None>(float *, const __fp16 *, int, int, int, int, int); -#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#endif // defined(ARM_COMPUTE_ENABLE_FP16) #ifdef ARM_COMPUTE_ENABLE_BF16 template void Transform<8, 1, true, VLType::None>(float *, const bfloat16 *, int, int, int, int, int); #endif // ARM_COMPUTE_ENABLE_BF16 #endif // AArch32 -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(ARM_COMPUTE_ENABLE_FP16) template void Transform<12, 1, false, VLType::None>(float *, const __fp16 *, int, int, int, int, int); -#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#endif // defined(ARM_COMPUTE_ENABLE_FP16) #ifdef ARM_COMPUTE_ENABLE_BF16 template void Transform<12, 1, false, VLType::None>(float *, const bfloat16 *, int, int, int, int, int); #endif // ARM_COMPUTE_ENABLE_BF16 diff --git a/src/core/helpers/LUTManager.cpp b/src/core/helpers/LUTManager.cpp index 06e35eed8c..2effffbe92 100644 --- a/src/core/helpers/LUTManager.cpp +++ b/src/core/helpers/LUTManager.cpp @@ -30,17 +30,38 @@ namespace arm_compute namespace { -void init_lut_fp16(ActivationLayerInfo::LookupTable65536 *lut) +float16_t activation(float16_t x, const LUTInfo &info) +{ + float16_t out = 0.f; + switch (info.act) + { + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + out = 1.f / (1.f + std::exp(-x)); + break; + case ActivationLayerInfo::ActivationFunction::TANH: + { + out = static_cast<float16_t>(info.alpha * std::tanh(info.beta * x)); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported Activation for 16-bit LUT table"); + break; + } + return out; +} + +void init_lut_fp16(ActivationLayerInfo::LookupTable65536 *lut, const LUTInfo &info) { union Element { uint16_t i = 0; float16_t fp; } item; + // Fill lut by iterating over all 16 bit values using the union. while (true) { - (*lut)[item.i] = 1.f / (1.f + std::exp(-item.fp)); + (*lut)[item.i] = activation(item.fp, info); if (item.i == 65535) break; item.i++; @@ -62,7 +83,7 @@ std::shared_ptr<ActivationLayerInfo::LookupTable65536> LUTManager::get_lut_table // Not found, or pointer not valid // We do not use make_shared to prevent the weak_ptr keeping the control block alive std::shared_ptr<ActivationLayerInfo::LookupTable65536> ptr(new ActivationLayerInfo::LookupTable65536); - init_lut_fp16(ptr.get()); + init_lut_fp16(ptr.get(), info); map_fp16[info] = ptr; return ptr; } diff --git a/src/core/helpers/LUTManager.h b/src/core/helpers/LUTManager.h index 4e13ead7e3..f3f4bf2832 100644 --- a/src/core/helpers/LUTManager.h +++ b/src/core/helpers/LUTManager.h @@ -38,19 +38,23 @@ namespace arm_compute struct LUTInfo { ActivationLayerInfo::ActivationFunction act; + float alpha; + float beta; DataType dt; - QuantizationInfo qinfo; + UniformQuantizationInfo qinfo; + // Operators enable use of map with Lutinfo as key friend bool operator<(const LUTInfo &l, const LUTInfo &r) { - return (l.act < r.act) || ((l.act == r.act) && (l.dt < r.dt)) || - ((l.act == r.act) && (l.dt == r.dt) && (l.qinfo.scale() < r.qinfo.scale())) || - ((l.act == r.act) && (l.dt == r.dt) && (l.qinfo.scale() == r.qinfo.scale()) && - (l.qinfo.offset() < l.qinfo.offset())); + const auto l_tup = std::make_tuple(l.act, l.alpha, l.beta, l.dt, l.qinfo.scale, l.qinfo.offset); + const auto r_tup = std::make_tuple(r.act, r.alpha, r.beta, r.dt, r.qinfo.scale, r.qinfo.offset); + + return l_tup < r_tup; } - bool operator==(const LUTInfo &l) + bool operator==(const LUTInfo &l) const { - return this->act == l.act && this->dt == l.dt && this->qinfo == l.qinfo; + return this->act == l.act && this->alpha == l.alpha && this->beta == l.beta && this->dt == l.dt && + this->qinfo == l.qinfo; } }; diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp index 7cfa39b286..4253027231 100644 --- a/src/cpu/kernels/CpuActivationKernel.cpp +++ b/src/cpu/kernels/CpuActivationKernel.cpp @@ -43,6 +43,13 @@ namespace kernels { namespace { + +bool is_fp16_lut_supported(ActivationLayerInfo::ActivationFunction func) +{ + return func == ActivationLayerInfo::ActivationFunction::LOGISTIC || + func == ActivationLayerInfo::ActivationFunction::TANH; +} + static const std::vector<CpuActivationKernel::ActivationKernel> available_kernels = { #ifdef ARM_COMPUTE_ENABLE_SVE {"sve2_q8_activation_lut", @@ -85,10 +92,7 @@ static const std::vector<CpuActivationKernel::ActivationKernel> available_kernel REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)}, {"sve_fp16_activation_lut", [](const ActivationDataTypeISASelectorData &data) - { - return data.dt == DataType::F16 && data.isa.fp16 && data.isa.sve && - data.f == ActivationLayerInfo::ActivationFunction::LOGISTIC; - }, + { return data.dt == DataType::F16 && data.isa.fp16 && data.isa.sve && is_fp16_lut_supported(data.f); }, REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation_lut)}, {"sve_fp16_activation", [](const ActivationDataTypeISASelectorData &data) @@ -299,10 +303,10 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac activation_info.setLookupTable256(tmp_lut); } - if (src->data_type() == DataType::F16 && - activation_info.activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC) + if (std::string(uk->name) == "sve_fp16_activation_lut") { - const LUTInfo info = {activation_info.activation(), src->data_type(), src->quantization_info()}; + const LUTInfo info = {activation_info.activation(), activation_info.a(), activation_info.b(), src->data_type(), + src->quantization_info().uniform()}; activation_info.setLookupTable65536((lut_manager.get_lut_table(info))); } #endif // __aarch64__ diff --git a/src/cpu/kernels/CpuDequantizeKernel.cpp b/src/cpu/kernels/CpuDequantizeKernel.cpp index d17128b5ac..5595ace998 100644 --- a/src/cpu/kernels/CpuDequantizeKernel.cpp +++ b/src/cpu/kernels/CpuDequantizeKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,12 +29,14 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NESymm.h" #include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/dequantize/generic/neon/list.h" #include <arm_neon.h> @@ -62,301 +64,6 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) return Status{}; } - -template <typename T> -inline void store_result(T *ptr, const float32x4x4_t &v) -{ - ARM_COMPUTE_UNUSED(ptr, v); -} - -template <> -inline void store_result<float>(float *ptr, const float32x4x4_t &v) -{ - wrapper::vstore(ptr, v.val[0]); - wrapper::vstore(ptr + 4, v.val[1]); - wrapper::vstore(ptr + 8, v.val[2]); - wrapper::vstore(ptr + 12, v.val[3]); -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template <> -inline void store_result<float16_t>(float16_t *ptr, const float32x4x4_t &v) -{ - wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1]))); - wrapper::vstore(ptr + 8, vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3]))); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -template <typename T> -inline void store_result(T *ptr, const float32x4x2_t &v) -{ - ARM_COMPUTE_UNUSED(ptr, v); -} - -template <> -inline void store_result<float>(float *ptr, const float32x4x2_t &v) -{ - wrapper::vstore(ptr, v.val[0]); - wrapper::vstore(ptr + 4, v.val[1]); -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template <> -inline void store_result<float16_t>(float16_t *ptr, const float32x4x2_t &v) -{ - wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1]))); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -template <typename TOut, typename TIn> -void run_dequantization_qasymm8(const ITensor *input, ITensor *output, const Window &window) -{ - const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform(); - const float scale = qinfo.scale; - const int32_t offset = qinfo.offset; - - const int window_step_x = 16; - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create iterators - Iterator in(input, win_collapsed); - Iterator out(output, win_collapsed); - - execute_window_loop( - win_collapsed, - [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast<const TIn *>(in.ptr()); - const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, scale, offset); - - store_result(reinterpret_cast<TOut *>(out_ptr + x), vdeq); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - auto val = *(in_ptr + x); - *(out_ptr + x) = static_cast<TOut>(Qasymm8QuantizationHelper<TIn>::dequantize(val, qinfo)); - } - }, - in, out); -} - -template <typename T> -void run_dequantization_qsymm8_per_channel_nchw(const ITensor *input, ITensor *output, const Window &window) -{ - const auto scale = input->info()->quantization_info().scale(); - - const int window_step_x = 16; - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - - // Reset first dimension to handle tail calculations manually - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create iterators - Iterator in(input, win); - Iterator out(output, win); - - execute_window_loop( - win, - [&](const Coordinates &id) - { - const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr()); - const auto out_ptr = reinterpret_cast<T *>(out.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, scale[id.z()]); - - store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - int8_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast<T>(dequantize(val, scale[id.z()])); - } - }, - in, out); -} - -template <typename T> -void run_dequantization_qsymm8_per_channel_nhwc(const ITensor *input, ITensor *output, const Window &window) -{ - const auto scale = input->info()->quantization_info().scale(); - - const int window_step_x = 16; - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - - // Reset first dimension to handle tail calculations manually - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create iterators - Iterator in(input, win); - Iterator out(output, win); - - execute_window_loop( - win, - [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr()); - const auto out_ptr = reinterpret_cast<T *>(out.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t vscale = {{scale[x + 0], scale[x + 1], scale[x + 2], scale[x + 3], scale[x + 4], - scale[x + 5], scale[x + 6], scale[x + 7], scale[x + 8], scale[x + 9], - scale[x + 10], scale[x + 11], scale[x + 12], scale[x + 13], - scale[x + 14], scale[x + 15]}}; - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, vscale); - - store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - int8_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast<T>(dequantize(val, scale[x])); - } - }, - in, out); -} - -template <typename T> -void run_dequantization_qsymm8(const ITensor *input, ITensor *output, const Window &window) -{ - const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform(); - const float scale = qinfo.scale; - - const int window_step_x = 16; - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create iterators - Iterator in(input, win_collapsed); - Iterator out(output, win_collapsed); - - execute_window_loop( - win_collapsed, - [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr()); - const auto out_ptr = reinterpret_cast<T *>(out.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, scale); - - store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - int8_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast<T>(dequantize(val, scale)); - } - }, - in, out); -} - -template <typename T> -void run_dequantization_qsymm16(const ITensor *input, ITensor *output, const Window &window) -{ - const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform(); - const float scale = qinfo.scale; - - const int window_step_x = 8; - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create iterators - Iterator in(input, win_collapsed); - Iterator out(output, win_collapsed); - - execute_window_loop( - win_collapsed, - [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast<const int16_t *>(in.ptr()); - const auto out_ptr = reinterpret_cast<T *>(out.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize_int16(vin, scale); - - store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - int16_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast<T>(dequantize_qsymm16(val, scale)); - } - }, - in, out); -} - -template <typename T> -void run_dequantization_core(const ITensor *input, ITensor *output, const Window &window) -{ - switch (input->info()->data_type()) - { - case DataType::QASYMM8: - run_dequantization_qasymm8<T, uint8_t>(input, output, window); - break; - case DataType::QASYMM8_SIGNED: - run_dequantization_qasymm8<T, int8_t>(input, output, window); - break; - case DataType::QSYMM8_PER_CHANNEL: - input->info()->data_layout() == DataLayout::NHWC - ? run_dequantization_qsymm8_per_channel_nhwc<T>(input, output, window) - : run_dequantization_qsymm8_per_channel_nchw<T>(input, output, window); - break; - case DataType::QSYMM8: - run_dequantization_qsymm8<T>(input, output, window); - break; - case DataType::QSYMM16: - run_dequantization_qsymm16<T>(input, output, window); - break; - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - } -} } // namespace void CpuDequantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) @@ -370,6 +77,20 @@ void CpuDequantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) auto_init_if_empty(*dst, src->tensor_shape(), 1, DataType::F32); ICpuKernel::configure(win); + + switch (dst->data_type()) + { + case DataType::F32: + _func = REGISTER_FP32_NEON(fp32_run_dequantization_core); + break; +#ifdef ARM_COMPUTE_ENABLE_FP16 + case DataType::F16: + _func = REGISTER_FP16_NEON(fp16_run_dequantization_core); + break; +#endif /* ARM_COMPUTE_ENABLE_FP16 */ + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } } Status CpuDequantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) @@ -386,20 +107,7 @@ void CpuDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, con const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); auto dst = tensors.get_tensor(TensorType::ACL_DST); - - switch (dst->info()->data_type()) - { - case DataType::F32: - run_dequantization_core<float>(src, dst, window); - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - run_dequantization_core<float16_t>(src, dst, window); - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - } + _func(src, dst, window); } const char *CpuDequantizeKernel::name() const { diff --git a/src/cpu/kernels/CpuDequantizeKernel.h b/src/cpu/kernels/CpuDequantizeKernel.h index 6ed58587c9..d8b6444f0a 100644 --- a/src/cpu/kernels/CpuDequantizeKernel.h +++ b/src/cpu/kernels/CpuDequantizeKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CPU_DEQUANTIZE_KERNEL_H -#define ARM_COMPUTE_CPU_DEQUANTIZE_KERNEL_H +#ifndef ACL_SRC_CPU_KERNELS_CPUDEQUANTIZEKERNEL_H +#define ACL_SRC_CPU_KERNELS_CPUDEQUANTIZEKERNEL_H #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -56,8 +56,16 @@ public: // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; + +private: + /** Common signature for all the specialised @ref CpuDequantizeKernel functions + * + * @param[in] window Region on which to execute the kernel. + */ + using DequantizeFunctionExecutorPtr = void (*)(const ITensor *input, ITensor *output, const Window &window); + DequantizeFunctionExecutorPtr _func{nullptr}; }; } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DEQUANTIZE_KERNEL_H */ +#endif // ACL_SRC_CPU_KERNELS_CPUDEQUANTIZEKERNEL_H diff --git a/src/cpu/kernels/CpuQuantizeKernel.cpp b/src/cpu/kernels/CpuQuantizeKernel.cpp index d2ac6cf8ac..ed4675ae3d 100644 --- a/src/cpu/kernels/CpuQuantizeKernel.cpp +++ b/src/cpu/kernels/CpuQuantizeKernel.cpp @@ -29,12 +29,12 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/quantize/generic/neon/list.h" #include <arm_neon.h> #include <map> @@ -47,7 +47,6 @@ namespace kernels { namespace { -constexpr auto window_step = 16; Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) { @@ -63,59 +62,6 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) return Status{}; } -template <typename T> -inline float32x4x4_t load_value(const T *input_ptr) -{ - using Tx16_t = typename wrapper::traits::neon_vector<T, 16>::type; - return arm_compute::convert_to_float32x4x4<Tx16_t>(wrapper::vloadq(input_ptr)); -} - -template <> -inline float32x4x4_t load_value(const float *input_ptr) -{ - return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), wrapper::vloadq(input_ptr + 8), - wrapper::vloadq(input_ptr + 12)}; -} -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template <> -inline float32x4x4_t load_value(const float16_t *input_ptr) -{ - return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)), - vcvt_f32_f16(wrapper::vload(input_ptr + 8)), vcvt_f32_f16(wrapper::vload(input_ptr + 12))}; -} - -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -template <typename element_type> -using vector_type = wrapper::traits::neon_vector_t<element_type, window_step>; - -template <typename quantized_type> -vector_type<quantized_type> vquantize_qasymm8(const float32x4x4_t &qv, const UniformQuantizationInfo &qi); - -template <> -vector_type<uint8_t> vquantize_qasymm8<uint8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) -{ - return vquantize(qv, qi); -} - -template <> -vector_type<int8_t> vquantize_qasymm8<int8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) -{ - return vquantize_signed(qv, qi); -} - -template <typename TOut, typename = typename std::enable_if<std::is_signed<TOut>::value, bool>::type> -inline int8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper) -{ - return wrapper::vcombine(wrapper::vqmovn(lower), wrapper::vqmovn(upper)); -} - -template <typename TOut, typename = typename std::enable_if<std::is_unsigned<TOut>::value, bool>::type> -inline uint8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper) -{ - return wrapper::vcombine(wrapper::vqmovun(lower), wrapper::vqmovun(upper)); -} - } // namespace void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) @@ -124,38 +70,36 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); static const std::map<std::string, QuantizeFunctionExecutorPtr> quant_map = { - {"op_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, uint8_t>}, - {"op_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, int8_t>}, - {"op_QASYMM8_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<uint8_t>}, + {"op_QASYMM8_QASYMM8", REGISTER_INTEGER_NEON(u8_u8_run_quantize_qasymm8)}, + {"op_QASYMM8_QASYMM8_SIGNED", REGISTER_INTEGER_NEON(u8_i8_run_quantize_qasymm8)}, + {"op_QASYMM8_QASYMM16", REGISTER_INTEGER_NEON(u8_run_quantize_qasymm16)}, - {"op_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, uint8_t>}, - {"op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, int8_t>}, - {"op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<int8_t>}, + {"op_QASYMM8_SIGNED_QASYMM8", REGISTER_INTEGER_NEON(i8_u8_run_quantize_qasymm8)}, + {"op_QASYMM8_SIGNED_QASYMM8_SIGNED", REGISTER_INTEGER_NEON(i8_i8_run_quantize_qasymm8)}, + {"op_QASYMM8_SIGNED_QASYMM16", REGISTER_INTEGER_NEON(i8_run_quantize_qasymm16)}, // Functions for offset only requantization - {"op_OFFSET_ONLY_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_requantize_offset_only<uint8_t, uint8_t>}, - {"op_OFFSET_ONLY_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_requantize_offset_only<uint8_t, int8_t>}, - {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_requantize_offset_only<int8_t, uint8_t>}, - {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8_SIGNED", - &CpuQuantizeKernel::run_requantize_offset_only<int8_t, int8_t>}, + {"op_OFFSET_ONLY_QASYMM8_QASYMM8", REGISTER_INTEGER_NEON(u8_u8_run_requantize_offset_only)}, + {"op_OFFSET_ONLY_QASYMM8_QASYMM8_SIGNED", REGISTER_INTEGER_NEON(u8_i8_run_requantize_offset_only)}, + {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8", REGISTER_INTEGER_NEON(i8_u8_run_requantize_offset_only)}, + {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8_SIGNED", REGISTER_INTEGER_NEON(i8_i8_run_requantize_offset_only)}, // Functions for offset uint8 to int8 and vice versa quantization (no scale changes) {"op_OFFSET_ONLY_CONVERT_QASYMM8_SIGNED_QASYMM8", - &CpuQuantizeKernel::run_requantize_offset_only_convert<int8_t, uint8_t>}, + REGISTER_INTEGER_NEON(i8_u8_run_requantize_offset_only_convert)}, {"op_OFFSET_ONLY_CONVERT_QASYMM8_QASYMM8_SIGNED", - &CpuQuantizeKernel::run_requantize_offset_only_convert<uint8_t, int8_t>}, - - {"op_F32_QSYMM8", &CpuQuantizeKernel::run_quantize_qsymm8<float, int8_t>}, - - {"op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float, uint8_t>}, - {"op_F32_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float, int8_t>}, - {"op_F32_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float>}, - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - {"op_F16_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, uint8_t>}, - {"op_F16_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, int8_t>}, - {"op_F16_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float16_t>}, -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/ + REGISTER_INTEGER_NEON(u8_i8_run_requantize_offset_only_convert)}, + + {"op_F32_QSYMM8", REGISTER_FP32_NEON(fp32_i8_run_quantize_qsymm8)}, + {"op_F32_QASYMM8", REGISTER_FP32_NEON(fp32_u8_run_quantize_qasymm8)}, + {"op_F32_QASYMM8_SIGNED", REGISTER_FP32_NEON(fp32_i8_run_quantize_qasymm8)}, + {"op_F32_QASYMM16", REGISTER_FP32_NEON(fp32_run_quantize_qasymm16)}, + +#ifdef ARM_COMPUTE_ENABLE_FP16 + {"op_F16_QASYMM8", REGISTER_FP16_NEON(fp16_u8_run_quantize_qasymm8)}, + {"op_F16_QASYMM8_SIGNED", REGISTER_FP16_NEON(fp16_i8_run_quantize_qasymm8)}, + {"op_F16_QASYMM16", REGISTER_FP16_NEON(fp16_run_quantize_qasymm16)}, +#endif /* ARM_COMPUTE_ENABLE_FP16 */ }; std::string function_to_call("op_"); @@ -203,242 +147,6 @@ Status CpuQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *ds return Status{}; } -template <typename TIn, typename TOut> -void CpuQuantizeKernel::run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window) -{ - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - - const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); - UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - execute_window_loop( - win_collapsed, - [&](const Coordinates &) - { - auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); - auto output_ptr = reinterpret_cast<TOut *>(output.ptr()); - int x = window_start_x; - for (; x <= (window_end_x - window_step); x += window_step) - { - wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo)); - } - // Compute left-over elements - for (; x < window_end_x; ++x) - { - output_ptr[x] = quantize_qsymm8(input_ptr[x], dst->info()->quantization_info()); - } - }, - input, output); -} - -template <typename TIn, typename TOut> -void CpuQuantizeKernel::run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window) -{ - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - - // Calculate output offset difference. - const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); - UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Duplicate offset in signed vector format - const int8x16_t offset = wrapper::vdup_n(static_cast<int8_t>(uqinfo.offset), wrapper::traits::vector_128_tag{}); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - execute_window_loop( - win_collapsed, - [&](const Coordinates &) - { - auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); - auto output_ptr = reinterpret_cast<TOut *>(output.ptr()); - int x = window_start_x; - for (; x <= (window_end_x - window_step); x += window_step) - { - const wrapper::traits::neon_vector_t<TIn, window_step> qv = - wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype - - // Signed addition. - auto res = vaddq_s8(reinterpret_cast<int8x16_t>(qv), offset); - - // Output is dependent on datatype. - wrapper::vstore(&output_ptr[x], - reinterpret_cast<wrapper::traits::neon_vector_t<TOut, window_step>>(res)); - } - // Compute left-over elements - for (; x < window_end_x; ++x) - { - auto result = uqinfo.offset + static_cast<int32_t>(input_ptr[x]); - output_ptr[x] = static_cast<TOut>(result); - } - }, - input, output); -} - -template <typename TIn, typename TOut> -void CpuQuantizeKernel::run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window) -{ - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - - const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); - UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Duplicate offset in signed vector format - const int16x8_t offset = wrapper::vdup_n(static_cast<int16_t>(uqinfo.offset), wrapper::traits::vector_128_tag{}); - - const int32_t low_bound = (dst->info()->data_type() == DataType::QASYMM8) ? 0 : -128; - const int32_t upper_bound = (dst->info()->data_type() == DataType::QASYMM8) ? 255 : 127; - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - execute_window_loop( - win_collapsed, - [&](const Coordinates &) - { - auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); - TOut *output_ptr = reinterpret_cast<TOut *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step); x += window_step) - { - const auto qv = wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype - int16x8_t lower = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgetlow(qv))); - int16x8_t upper = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgethigh(qv))); - - // Signed addition. - lower = wrapper::vqadd(lower, offset); - upper = wrapper::vqadd(upper, offset); - - // Output is dependent on datatype. - auto res = recombine_8_16<TOut>(lower, upper); - wrapper::vstore(&output_ptr[x], res); - } - // Compute left-over elements - for (; x < window_end_x; ++x) - { - // Add offset and clamp result to within the range of the output datatype. - int32_t result = uqinfo.offset + static_cast<int32_t>(input_ptr[x]); - result = utility::clamp<int32_t>(result, low_bound, upper_bound); - - // Cast result to output datatype. - output_ptr[x] = static_cast<TOut>(result); - } - }, - input, output); -} - -template <typename TIn, typename TOut> -void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) -{ - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - - const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); - UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - if (is_data_type_quantized_asymmetric(src->info()->data_type())) - { - uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); - } -#ifdef __aarch64__ - constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; -#else //__aarch64__ - constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; -#endif //__aarch64__ - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - execute_window_loop( - win_collapsed, - [&](const Coordinates &) - { - auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); - auto output_ptr = reinterpret_cast<TOut *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step); x += window_step) - { - wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo)); - } - // Compute left-over elements - for (; x < window_end_x; ++x) - { - output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], uqinfo, rounding_policy); - } - }, - input, output); -} - -template <typename T> -void CpuQuantizeKernel::run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window) -{ - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - - const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); - UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - if (is_data_type_quantized_asymmetric(src->info()->data_type())) - { - uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); - } -#ifdef __aarch64__ - constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; -#else //__aarch64__ - constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; -#endif //__aarch64__ - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - execute_window_loop( - win_collapsed, - [&](const Coordinates &) - { - auto input_ptr = reinterpret_cast<const T *>(input.ptr()); - auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step); x += window_step) - { - uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo); - vst1q_u16(&output_ptr[x], tmp.val[0]); - vst1q_u16(&output_ptr[x + 8], tmp.val[1]); - } - // Compute left-over elements - for (; x < window_end_x; ++x) - { - output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy); - } - }, - input, output); -} - void CpuQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); @@ -448,7 +156,7 @@ void CpuQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, const const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); auto dst = tensors.get_tensor(TensorType::ACL_DST); - (this->*_func)(src, dst, window); + (*_func)(src, dst, window); } const char *CpuQuantizeKernel::name() const diff --git a/src/cpu/kernels/CpuQuantizeKernel.h b/src/cpu/kernels/CpuQuantizeKernel.h index c2f7ac6d9d..750310c811 100644 --- a/src/cpu/kernels/CpuQuantizeKernel.h +++ b/src/cpu/kernels/CpuQuantizeKernel.h @@ -76,31 +76,7 @@ private: * * @param[in] window Region on which to execute the kernel. */ - using QuantizeFunctionExecutorPtr = void (CpuQuantizeKernel::*)(const ITensor *src, - ITensor *dst, - const Window &window); - /** Function to apply QASYMM8 or QASYMM8_SIGNED quantization on a tensor. - * - * @param[in] window Region on which to execute the kernel. - */ - template <typename TIn, typename TOut> - void run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window); - /** Function to apply QASYMM16 quantization on a tensor. - * - * @param[in] window Region on which to execute the kernel. - */ - template <typename T> - void run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window); - - template <typename TIn, typename TOut> - void run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window); - - template <typename TIn, typename TOut> - void run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window); - - template <typename TIn, typename TOut> - void run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window); - + using QuantizeFunctionExecutorPtr = void (*)(const ITensor *src, ITensor *dst, const Window &window); QuantizeFunctionExecutorPtr _func{nullptr}; size_t _split_dimension{Window::DimY}; }; diff --git a/src/cpu/kernels/dequantize/generic/neon/fp16.cpp b/src/cpu/kernels/dequantize/generic/neon/fp16.cpp new file mode 100644 index 0000000000..caffdf53e1 --- /dev/null +++ b/src/cpu/kernels/dequantize/generic/neon/fp16.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +#include "src/cpu/kernels/dequantize/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void fp16_run_dequantization_core(const ITensor *input, ITensor *output, const Window &window) +{ + run_dequantization_core<float16_t>(input, output, window); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/dequantize/generic/neon/fp32.cpp b/src/cpu/kernels/dequantize/generic/neon/fp32.cpp new file mode 100644 index 0000000000..58e987b450 --- /dev/null +++ b/src/cpu/kernels/dequantize/generic/neon/fp32.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/dequantize/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void fp32_run_dequantization_core(const ITensor *input, ITensor *output, const Window &window) +{ + run_dequantization_core<float>(input, output, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/dequantize/generic/neon/impl.h b/src/cpu/kernels/dequantize/generic/neon/impl.h new file mode 100644 index 0000000000..7197d4dff6 --- /dev/null +++ b/src/cpu/kernels/dequantize/generic/neon/impl.h @@ -0,0 +1,340 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_DEQUANTIZE_GENERIC_NEON_IMPL_H +#define ACL_SRC_CPU_KERNELS_DEQUANTIZE_GENERIC_NEON_IMPL_H + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" + +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NESymm.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/dequantize/generic/neon/list.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ + +template <typename T> +inline void store_result(T *ptr, const float32x4x4_t &v) +{ + ARM_COMPUTE_UNUSED(ptr, v); +} + +template <> +inline void store_result<float>(float *ptr, const float32x4x4_t &v) +{ + wrapper::vstore(ptr, v.val[0]); + wrapper::vstore(ptr + 4, v.val[1]); + wrapper::vstore(ptr + 8, v.val[2]); + wrapper::vstore(ptr + 12, v.val[3]); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> +inline void store_result<float16_t>(float16_t *ptr, const float32x4x4_t &v) +{ + wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1]))); + wrapper::vstore(ptr + 8, vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3]))); +} +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +template <typename T> +inline void store_result(T *ptr, const float32x4x2_t &v) +{ + ARM_COMPUTE_UNUSED(ptr, v); +} + +template <> +inline void store_result<float>(float *ptr, const float32x4x2_t &v) +{ + wrapper::vstore(ptr, v.val[0]); + wrapper::vstore(ptr + 4, v.val[1]); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> +inline void store_result<float16_t>(float16_t *ptr, const float32x4x2_t &v) +{ + wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1]))); +} +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +template <typename TOut, typename TIn> +void run_dequantization_qasymm8(const ITensor *input, ITensor *output, const Window &window) +{ + const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform(); + const float scale = qinfo.scale; + const int32_t offset = qinfo.offset; + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator in(input, win_collapsed); + Iterator out(output, win_collapsed); + + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const TIn *>(in.ptr()); + const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, scale, offset); + + store_result(reinterpret_cast<TOut *>(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + auto val = *(in_ptr + x); + *(out_ptr + x) = static_cast<TOut>(Qasymm8QuantizationHelper<TIn>::dequantize(val, qinfo)); + } + }, + in, out); +} + +template <typename T> +void run_dequantization_qsymm8_per_channel_nchw(const ITensor *input, ITensor *output, const Window &window) +{ + const auto scale = input->info()->quantization_info().scale(); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Reset first dimension to handle tail calculations manually + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator in(input, win); + Iterator out(output, win); + + execute_window_loop( + win, + [&](const Coordinates &id) + { + const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr()); + const auto out_ptr = reinterpret_cast<T *>(out.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, scale[id.z()]); + + store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int8_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast<T>(dequantize(val, scale[id.z()])); + } + }, + in, out); +} + +template <typename T> +void run_dequantization_qsymm8_per_channel_nhwc(const ITensor *input, ITensor *output, const Window &window) +{ + const auto scale = input->info()->quantization_info().scale(); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Reset first dimension to handle tail calculations manually + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator in(input, win); + Iterator out(output, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr()); + const auto out_ptr = reinterpret_cast<T *>(out.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t vscale = {{scale[x + 0], scale[x + 1], scale[x + 2], scale[x + 3], scale[x + 4], + scale[x + 5], scale[x + 6], scale[x + 7], scale[x + 8], scale[x + 9], + scale[x + 10], scale[x + 11], scale[x + 12], scale[x + 13], + scale[x + 14], scale[x + 15]}}; + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, vscale); + + store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int8_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast<T>(dequantize(val, scale[x])); + } + }, + in, out); +} + +template <typename T> +void run_dequantization_qsymm8(const ITensor *input, ITensor *output, const Window &window) +{ + const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform(); + const float scale = qinfo.scale; + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator in(input, win_collapsed); + Iterator out(output, win_collapsed); + + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr()); + const auto out_ptr = reinterpret_cast<T *>(out.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, scale); + + store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int8_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast<T>(dequantize(val, scale)); + } + }, + in, out); +} + +template <typename T> +void run_dequantization_qsymm16(const ITensor *input, ITensor *output, const Window &window) +{ + const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform(); + const float scale = qinfo.scale; + + const int window_step_x = 8; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator in(input, win_collapsed); + Iterator out(output, win_collapsed); + + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const int16_t *>(in.ptr()); + const auto out_ptr = reinterpret_cast<T *>(out.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize_int16(vin, scale); + + store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int16_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast<T>(dequantize_qsymm16(val, scale)); + } + }, + in, out); +} + +template <typename T> +void run_dequantization_core(const ITensor *input, ITensor *output, const Window &window) +{ + switch (input->info()->data_type()) + { + case DataType::QASYMM8: + run_dequantization_qasymm8<T, uint8_t>(input, output, window); + break; + case DataType::QASYMM8_SIGNED: + run_dequantization_qasymm8<T, int8_t>(input, output, window); + break; + case DataType::QSYMM8_PER_CHANNEL: + input->info()->data_layout() == DataLayout::NHWC + ? run_dequantization_qsymm8_per_channel_nhwc<T>(input, output, window) + : run_dequantization_qsymm8_per_channel_nchw<T>(input, output, window); + break; + case DataType::QSYMM8: + run_dequantization_qsymm8<T>(input, output, window); + break; + case DataType::QSYMM16: + run_dequantization_qsymm16<T>(input, output, window); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } +} + +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_KERNELS_DEQUANTIZE_GENERIC_NEON_IMPL_H diff --git a/src/cpu/kernels/dequantize/generic/neon/list.h b/src/cpu/kernels/dequantize/generic/neon/list.h new file mode 100644 index 0000000000..678eb2c01a --- /dev/null +++ b/src/cpu/kernels/dequantize/generic/neon/list.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_DEQUANTIZE_GENERIC_NEON_LIST_H +#define ACL_SRC_CPU_KERNELS_DEQUANTIZE_GENERIC_NEON_LIST_H + +#include "arm_compute/core/Helpers.h" + +namespace arm_compute +{ +namespace cpu +{ + +#define DECLARE_DEQUANTIZE_KERNEL(func_name) void func_name(const ITensor *input, ITensor *output, const Window &window) + +DECLARE_DEQUANTIZE_KERNEL(fp32_run_dequantization_core); +DECLARE_DEQUANTIZE_KERNEL(fp16_run_dequantization_core); + +#undef DECLARE_DEQUANTIZE_KERNEL + +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_DEQUANTIZE_GENERIC_NEON_LIST_H diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp index 60fda511e3..6a93be0618 100644 --- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp +++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023 Arm Limited. + * Copyright (c) 2022-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -81,7 +81,7 @@ void vector_matrix_multiply_f16( // window_end_x is computed above which may cause out-of-bound writes to the dst. for (; x < (window_end_x - window_step_x); x += window_step_x) { - if (x > width_matrix_b) + if (x >= width_matrix_b) { return; } @@ -176,7 +176,7 @@ void vector_matrix_multiply_f16( for (; x < window_end_x; ++x) { - if (x > width_matrix_b) + if (x >= width_matrix_b) { return; } diff --git a/src/cpu/kernels/quantize/generic/neon/fp16.cpp b/src/cpu/kernels/quantize/generic/neon/fp16.cpp new file mode 100644 index 0000000000..37bfb5b2aa --- /dev/null +++ b/src/cpu/kernels/quantize/generic/neon/fp16.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +#include "src/cpu/kernels/quantize/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void fp16_u8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm8<float16_t, uint8_t>(src, dst, window); +} +void fp16_i8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm8<float16_t, int8_t>(src, dst, window); +} +void fp16_run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm16<float16_t>(src, dst, window); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/quantize/generic/neon/fp32.cpp b/src/cpu/kernels/quantize/generic/neon/fp32.cpp new file mode 100644 index 0000000000..0cba332fd6 --- /dev/null +++ b/src/cpu/kernels/quantize/generic/neon/fp32.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/quantize/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void fp32_u8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm8<float, uint8_t>(src, dst, window); +} +void fp32_i8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm8<float, int8_t>(src, dst, window); +} +void fp32_run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm16<float>(src, dst, window); +} + +void fp32_i8_run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qsymm8<float, int8_t>(src, dst, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/quantize/generic/neon/impl.h b/src/cpu/kernels/quantize/generic/neon/impl.h new file mode 100644 index 0000000000..9954a7645e --- /dev/null +++ b/src/cpu/kernels/quantize/generic/neon/impl.h @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_H +#define ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_H + +#include "arm_compute/core/Helpers.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + +namespace arm_compute +{ +namespace cpu +{ +constexpr auto window_step = 16; + +template <typename T> +inline float32x4x4_t load_value(const T *input_ptr) +{ + using Tx16_t = typename wrapper::traits::neon_vector<T, 16>::type; + return arm_compute::convert_to_float32x4x4<Tx16_t>(wrapper::vloadq(input_ptr)); +} + +template <> +inline float32x4x4_t load_value(const float *input_ptr) +{ + return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), wrapper::vloadq(input_ptr + 8), + wrapper::vloadq(input_ptr + 12)}; +} +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> +inline float32x4x4_t load_value(const float16_t *input_ptr) +{ + return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)), + vcvt_f32_f16(wrapper::vload(input_ptr + 8)), vcvt_f32_f16(wrapper::vload(input_ptr + 12))}; +} + +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +template <typename element_type> +using vector_type = wrapper::traits::neon_vector_t<element_type, window_step>; + +template <typename quantized_type> +inline vector_type<quantized_type> vquantize_qasymm8(const float32x4x4_t &qv, const UniformQuantizationInfo &qi); + +template <> +inline vector_type<uint8_t> vquantize_qasymm8<uint8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) +{ + return vquantize(qv, qi); +} + +template <> +inline vector_type<int8_t> vquantize_qasymm8<int8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) +{ + return vquantize_signed(qv, qi); +} + +template <typename TOut, typename = typename std::enable_if<std::is_signed<TOut>::value, bool>::type> +inline int8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper) +{ + return wrapper::vcombine(wrapper::vqmovn(lower), wrapper::vqmovn(upper)); +} + +template <typename TOut, typename = typename std::enable_if<std::is_unsigned<TOut>::value, bool>::type> +inline uint8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper) +{ + return wrapper::vcombine(wrapper::vqmovun(lower), wrapper::vqmovun(upper)); +} + +template <typename TIn, typename TOut> +void run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); + UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); + uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); + auto output_ptr = reinterpret_cast<TOut *>(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = quantize_qsymm8(input_ptr[x], dst->info()->quantization_info()); + } + }, + input, output); +} + +template <typename TIn, typename TOut> +void run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Calculate output offset difference. + const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); + UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); + uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Duplicate offset in signed vector format + const int8x16_t offset = wrapper::vdup_n(static_cast<int8_t>(uqinfo.offset), wrapper::traits::vector_128_tag{}); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); + auto output_ptr = reinterpret_cast<TOut *>(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + const wrapper::traits::neon_vector_t<TIn, window_step> qv = + wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype + + // Signed addition. + auto res = vaddq_s8(reinterpret_cast<int8x16_t>(qv), offset); + + // Output is dependent on datatype. + wrapper::vstore(&output_ptr[x], + reinterpret_cast<wrapper::traits::neon_vector_t<TOut, window_step>>(res)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + auto result = uqinfo.offset + static_cast<int32_t>(input_ptr[x]); + output_ptr[x] = static_cast<TOut>(result); + } + }, + input, output); +} + +template <typename TIn, typename TOut> +void run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); + UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); + uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Duplicate offset in signed vector format + const int16x8_t offset = wrapper::vdup_n(static_cast<int16_t>(uqinfo.offset), wrapper::traits::vector_128_tag{}); + + const int32_t low_bound = (dst->info()->data_type() == DataType::QASYMM8) ? 0 : -128; + const int32_t upper_bound = (dst->info()->data_type() == DataType::QASYMM8) ? 255 : 127; + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); + TOut *output_ptr = reinterpret_cast<TOut *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + const auto qv = wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype + int16x8_t lower = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgetlow(qv))); + int16x8_t upper = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgethigh(qv))); + + // Signed addition. + lower = wrapper::vqadd(lower, offset); + upper = wrapper::vqadd(upper, offset); + + // Output is dependent on datatype. + auto res = recombine_8_16<TOut>(lower, upper); + wrapper::vstore(&output_ptr[x], res); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + // Add offset and clamp result to within the range of the output datatype. + int32_t result = uqinfo.offset + static_cast<int32_t>(input_ptr[x]); + result = utility::clamp<int32_t>(result, low_bound, upper_bound); + + // Cast result to output datatype. + output_ptr[x] = static_cast<TOut>(result); + } + }, + input, output); +} + +template <typename TIn, typename TOut> +void run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); + UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); + if (is_data_type_quantized_asymmetric(src->info()->data_type())) + { + uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); + } +#ifdef __aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +#else //__aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; +#endif //__aarch64__ + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); + auto output_ptr = reinterpret_cast<TOut *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], uqinfo, rounding_policy); + } + }, + input, output); +} + +template <typename T> +void run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); + UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); + if (is_data_type_quantized_asymmetric(src->info()->data_type())) + { + uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); + } +#ifdef __aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +#else //__aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; +#endif //__aarch64__ + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast<const T *>(input.ptr()); + auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo); + vst1q_u16(&output_ptr[x], tmp.val[0]); + vst1q_u16(&output_ptr[x + 8], tmp.val[1]); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy); + } + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_H diff --git a/src/cpu/kernels/quantize/generic/neon/integer.cpp b/src/cpu/kernels/quantize/generic/neon/integer.cpp new file mode 100644 index 0000000000..4e39afaaee --- /dev/null +++ b/src/cpu/kernels/quantize/generic/neon/integer.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/quantize/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void u8_u8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm8<uint8_t, uint8_t>(src, dst, window); +} +void u8_i8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm8<uint8_t, int8_t>(src, dst, window); +} +void i8_u8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm8<int8_t, uint8_t>(src, dst, window); +} +void i8_i8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm8<int8_t, int8_t>(src, dst, window); +} + +void u8_run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm16<uint8_t>(src, dst, window); +} +void i8_run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm16<int8_t>(src, dst, window); +} + +void u8_u8_run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window) +{ + run_requantize_offset_only<uint8_t, uint8_t>(src, dst, window); +} +void u8_i8_run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window) +{ + run_requantize_offset_only<uint8_t, int8_t>(src, dst, window); +} +void i8_u8_run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window) +{ + run_requantize_offset_only<int8_t, uint8_t>(src, dst, window); +} +void i8_i8_run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window) +{ + run_requantize_offset_only<int8_t, int8_t>(src, dst, window); +} + +void i8_u8_run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window) +{ + run_requantize_offset_only_convert<int8_t, uint8_t>(src, dst, window); +} +void u8_i8_run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window) +{ + run_requantize_offset_only_convert<uint8_t, int8_t>(src, dst, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/quantize/generic/neon/list.h b/src/cpu/kernels/quantize/generic/neon/list.h new file mode 100644 index 0000000000..c4fb1048eb --- /dev/null +++ b/src/cpu/kernels/quantize/generic/neon/list.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_LIST_H +#define ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_LIST_H + +#include "arm_compute/core/Helpers.h" + +namespace arm_compute +{ +namespace cpu +{ + +#define DECLARE_QUANTIZE_KERNEL(func_name) void func_name(const ITensor *src, ITensor *dst, const Window &window) + +DECLARE_QUANTIZE_KERNEL(u8_u8_run_quantize_qasymm8); +DECLARE_QUANTIZE_KERNEL(u8_i8_run_quantize_qasymm8); +DECLARE_QUANTIZE_KERNEL(i8_u8_run_quantize_qasymm8); +DECLARE_QUANTIZE_KERNEL(i8_i8_run_quantize_qasymm8); + +DECLARE_QUANTIZE_KERNEL(u8_u8_run_requantize_offset_only); +DECLARE_QUANTIZE_KERNEL(u8_i8_run_requantize_offset_only); +DECLARE_QUANTIZE_KERNEL(i8_u8_run_requantize_offset_only); +DECLARE_QUANTIZE_KERNEL(i8_i8_run_requantize_offset_only); + +DECLARE_QUANTIZE_KERNEL(i8_u8_run_requantize_offset_only_convert); +DECLARE_QUANTIZE_KERNEL(u8_i8_run_requantize_offset_only_convert); + +DECLARE_QUANTIZE_KERNEL(u8_run_quantize_qasymm16); +DECLARE_QUANTIZE_KERNEL(i8_run_quantize_qasymm16); + +DECLARE_QUANTIZE_KERNEL(fp32_u8_run_quantize_qasymm8); +DECLARE_QUANTIZE_KERNEL(fp32_i8_run_quantize_qasymm8); +DECLARE_QUANTIZE_KERNEL(fp32_run_quantize_qasymm16); + +DECLARE_QUANTIZE_KERNEL(fp32_i8_run_quantize_qsymm8); + +DECLARE_QUANTIZE_KERNEL(fp16_u8_run_quantize_qasymm8); +DECLARE_QUANTIZE_KERNEL(fp16_i8_run_quantize_qasymm8); +DECLARE_QUANTIZE_KERNEL(fp16_run_quantize_qasymm16); + +#undef DECLARE_QUANTIZE_KERNEL + +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_LIST_H diff --git a/src/cpu/kernels/reduction_layer/generic/neon/fp16.cpp b/src/cpu/kernels/reduction_layer/generic/neon/fp16.cpp new file mode 100644 index 0000000000..143bb5487f --- /dev/null +++ b/src/cpu/kernels/reduction_layer/generic/neon/fp16.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "src/cpu/kernels/reduction_layer/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void reduce_RedOpX_reduceX_float16_8(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpX<float16_t, 8>>::reduceX(window, input, output, RedOpX<float16_t, 8>(), op); +} + +void reduce_RedOpYZW_reduceY_float16_8(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(), op); +} + +void reduce_RedOpYZW_reduceZ_float16_8(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8>(), op); +} + +void reduce_RedOpYZW_reduceW_float16_8(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, RedOpYZW<float16_t, 8>(), op); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/reduction_layer/generic/neon/fp32.cpp b/src/cpu/kernels/reduction_layer/generic/neon/fp32.cpp new file mode 100644 index 0000000000..6f5f13e571 --- /dev/null +++ b/src/cpu/kernels/reduction_layer/generic/neon/fp32.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/reduction_layer/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void reduce_RedOpYZW_complex_reduceZ_float32_4_2_SUM(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + Reducer<RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>>::reduceZ( + window, input, output, RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>(), op); +} + +void reduce_RedOpX_reduceX_float32_4(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op); +} + +void reduce_RedOpYZW_reduceY_float32_4(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(), op); +} + +void reduce_RedOpYZW_reduceZ_float32_4(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(), op); +} + +void reduce_RedOpYZW_reduceW_float32_4(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(), op); +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/reduction_layer/generic/neon/impl.h b/src/cpu/kernels/reduction_layer/generic/neon/impl.h new file mode 100644 index 0000000000..3fa821d3a4 --- /dev/null +++ b/src/cpu/kernels/reduction_layer/generic/neon/impl.h @@ -0,0 +1,1633 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_REDUCTION_LAYER_GENERIC_NEON_IMPL_H +#define ACL_SRC_CPU_KERNELS_REDUCTION_LAYER_GENERIC_NEON_IMPL_H + +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "support/SaturateCast.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +// Helper function that calls vqmovun/vqmvn, vcombine and vstore, allows templating of RedOpYZW_quantized +template <typename T> +void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output, int offset = 0) +{ + if (std::is_same<T, uint8_t>::value) + { + auto res = wrapper::vcombine(wrapper::vqmovun(t1), wrapper::vqmovun(t2)); + wrapper::vstore(output.ptr() + offset, res); + } + else + { + auto res = wrapper::vcombine(wrapper::vqmovn(t1), wrapper::vqmovn(t2)); + wrapper::vstore(reinterpret_cast<int8_t *>(output.ptr() + offset), res); + } +} + +template <typename T> +uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis) +{ + uint32x4_t mask{0}; + if (op == ReductionOperation::ARG_IDX_MIN) + { + mask = wrapper::vcgt(b, a); + } + else + { + mask = wrapper::vclt(b, a); + } + + uint32x4_t vec_idx = {idx, idx + 1, idx + 2, idx + 3}; + if (axis != 0) + { + vec_idx = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + } + uint32x4x4_t res = {{wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0}}; + + return res; +} + +template <typename T> +uint32x4x4_t calculate_index_quantized(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis) +{ + uint32x4x4_t mask{{0}}; + uint8x16_t mask_u8{0}; + if (op == ReductionOperation::ARG_IDX_MIN) + { + mask_u8 = wrapper::vcgt(b, a); + } + else + { + mask_u8 = wrapper::vclt(b, a); + } + auto wide_u16_1 = + wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8))); + auto wide_u16_2 = + wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8))); + mask.val[0] = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1))); + mask.val[1] = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1))); + mask.val[2] = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2))); + mask.val[3] = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2))); + + uint32x4x4_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3}, + {idx + 4, idx + 5, idx + 6, idx + 7}, + {idx + 8, idx + 9, idx + 10, idx + 11}, + {idx + 12, idx + 13, idx + 14, idx + 15}}}; + if (axis != 0) + { + vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + vec_idx.val[2] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + vec_idx.val[3] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + } + uint32x4x4_t res = { + {vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]), vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]), + vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]), vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])}}; + + return res; +} + +// Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value. +template <typename T> +inline typename std::enable_if< + std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value, + typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type>::type +calculate_min(T in) +{ + auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); + return wrapper::vpmin(pmin, pmin); +} + +// Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value. +template <typename T> +inline typename std::enable_if< + std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value, + typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type>::type +calculate_min(T in) +{ + auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); + pmin = wrapper::vpmin(pmin, pmin); + pmin = wrapper::vpmin(pmin, pmin); + return wrapper::vpmin(pmin, pmin); +} + +// Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value. +template <typename T> +inline typename std::enable_if< + std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value, + typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type>::type +calculate_max(T in) +{ + auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); + return wrapper::vpmax(pmax, pmax); +} + +// Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value. +template <typename T> +inline typename std::enable_if< + std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value, + typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type>::type +calculate_max(T in) +{ + auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); + pmax = wrapper::vpmax(pmax, pmax); + pmax = wrapper::vpmax(pmax, pmax); + return wrapper::vpmax(pmax, pmax); +} + +template <typename T> +uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op) +{ + uint32x4_t res_idx_mask{0}; + uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); + + if (op == ReductionOperation::ARG_IDX_MIN) + { + auto pmin = calculate_min(vec_res_value); + auto mask = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); + res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask); + } + else + { + auto pmax = calculate_max(vec_res_value); + auto mask = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax)); + res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask); + } + + res_idx_mask = wrapper::vadd(res_idx_mask, mask_ones); + auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask), wrapper::vgetlow(res_idx_mask)); + pmin = wrapper::vpmin(pmin, pmin); + uint32_t res = wrapper::vgetlane(pmin, 0); + + return (res - 0xFFFFFFFF); +} + +template <typename T> +uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op) +{ + uint32x4x4_t res_idx_mask{{0}}; + uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); + uint8x16_t mask_u8{0}; + if (op == ReductionOperation::ARG_IDX_MIN) + { + auto pmin = calculate_min(vec_res_value); + mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); + } + else + { + auto pmax = calculate_max(vec_res_value); + mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax)); + } + + // Widen vectors + auto wide_u16_1 = + wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8))); + auto wide_u16_2 = + wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8))); + auto wide_u32_1 = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1))); + auto wide_u32_2 = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1))); + auto wide_u32_3 = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2))); + auto wide_u32_4 = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2))); + res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1); + res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2); + res_idx_mask.val[2] = wrapper::vand(vec_res_idx.val[2], wide_u32_3); + res_idx_mask.val[3] = wrapper::vand(vec_res_idx.val[3], wide_u32_4); + res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones); + res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones); + res_idx_mask.val[2] = wrapper::vadd(res_idx_mask.val[2], mask_ones); + res_idx_mask.val[3] = wrapper::vadd(res_idx_mask.val[3], mask_ones); + + uint32_t res = 0xFFFFFFFF; + int iter = 0; + do + { + auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter])); + pmin = wrapper::vpmin(pmin, pmin); + res = std::min(wrapper::vgetlane(pmin, 0), res); + iter++; + } while (iter < 4); + + return (res - 0xFFFFFFFF); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> +uint32x4x4_t inline calculate_index( + uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis) +{ + uint32x4x2_t mask{0}; + uint16x8_t mask_u16{0}; + if (op == ReductionOperation::ARG_IDX_MIN) + { + mask_u16 = wrapper::vcgt(b, a); + } + else + { + mask_u16 = wrapper::vclt(b, a); + } + mask.val[0] = wrapper::vmovl(wrapper::vgetlow(mask_u16)); + mask.val[1] = wrapper::vmovl(wrapper::vgethigh(mask_u16)); + uint32x4x2_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3}, {idx + 4, idx + 5, idx + 6, idx + 7}}}; + if (axis != 0) + { + vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + } + uint32x4x4_t res = {wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]), + wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]), 0, 0}; + + return res; +} + +// Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value. +inline float16x4_t calculate_min(float16x8_t in) +{ + auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); + pmin = wrapper::vpmin(pmin, pmin); + return wrapper::vpmin(pmin, pmin); +} +// Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value. +inline float16x4_t calculate_max(float16x8_t in) +{ + auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); + pmax = wrapper::vpmax(pmax, pmax); + return wrapper::vpmax(pmax, pmax); +} + +template <> +inline uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_value, ReductionOperation op) +{ + uint32x4x2_t res_idx_mask{0}; + uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); + uint16x8_t mask_u16; + if (op == ReductionOperation::ARG_IDX_MIN) + { + auto pmin = calculate_min(vec_res_value); + mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); + } + else + { + auto pmax = calculate_max(vec_res_value); + mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax)); + } + + // Widen vectors + auto wide_u32_1 = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16))); + auto wide_u32_2 = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16))); + res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1); + res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2); + res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones); + res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones); + + uint32_t res = 0xFFFFFFFF; + uint32_t iter = 0; + do + { + auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter])); + pmin = wrapper::vpmin(pmin, pmin); + res = std::min(wrapper::vgetlane(pmin, 0), res); + iter++; + } while (iter < 2); + + return (res - 0xFFFFFFFF); +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +template <class F> +class Reducer +{ +public: + static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op) + { + // Set out window + Window out_window(window); + out_window.set(Window::DimX, Window::Dimension(0, 1, 1)); + + f(window, out_window, input, output, op); + } + static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op) + { + // Set in window + Window in_window(window); + Window out_window(window); + + in_window.set(Window::DimY, Window::Dimension(0, 1, 1)); + out_window.set(Window::DimY, Window::Dimension(0, output->info()->dimension(1), output->info()->dimension(1))); + + f(in_window, out_window, input, output, 1, op); + } + static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op) + { + // Set in window + Window in_window(window); + Window out_window(window); + + in_window.set(Window::DimZ, Window::Dimension(0, 1, 1)); + out_window.set(Window::DimZ, Window::Dimension(0, output->info()->dimension(2), output->info()->dimension(2))); + + f(in_window, out_window, input, output, 2, op); + } + static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op) + { + // Set in/out window + Window in_window(window); + Window out_window(window); + + in_window.set(3, Window::Dimension(0, 1, 1)); + out_window.set(3, Window::Dimension(0, 1, 1)); + + f(in_window, out_window, input, output, 3, op); + } +}; + +template <typename T, int S> +struct RedOpX +{ + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; + + inline void operator()( + const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op) + { + const size_t input_dim_0 = in->info()->dimension(0); + const int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast<int>(in_window.x().start()); + const auto window_end_x = static_cast<int>(in_window.x().end()); + + Window in_win_no_pad = in_window; + in_win_no_pad.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, in_win_no_pad); + Iterator output(out, out_window); + + execute_window_loop( + in_win_no_pad, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const T *>(input.ptr()); + + auto init_res_value = static_cast<T>(0.f); + switch (op) + { + case ReductionOperation::ARG_IDX_MAX: + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::MIN: + case ReductionOperation::MAX: + { + init_res_value = static_cast<T>(*input_ptr); + break; + } + case ReductionOperation::PROD: + { + init_res_value = static_cast<T>(1.f); + break; + } + default: + break; + } + auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{}); + uint32x4x4_t vec_res_idx{{0}}; + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vec_elements = wrapper::vloadq(input_ptr + x); + switch (op) + { + case ReductionOperation::SUM_SQUARE: + vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); + break; + case ReductionOperation::MEAN_SUM: + case ReductionOperation::SUM: + vec_res_value = wrapper::vadd(vec_elements, vec_res_value); + break; + case ReductionOperation::PROD: + vec_res_value = wrapper::vmul(vec_elements, vec_res_value); + break; + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, + vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, + vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + + switch (op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + case ReductionOperation::SUM_SQUARE: + { +#ifdef ARM_COMPUTE_DEBUG_ENABLED + auto res = static_cast<T>(0.f); + for (int i = 0; i < S; ++i) + { + res += wrapper::vgetlane(vec_res_value, i); + } +#else // ARM_COMPUTE_DEBUG_ENABLED + auto carry_res = + wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + for (int i = 0; i < S / 4; ++i) + { + carry_res = wrapper::vpadd(carry_res, carry_res); + } + auto res = wrapper::vgetlane(carry_res, 0); +#endif // ARM_COMPUTE_DEBUG_ENABLED + if (op == ReductionOperation::SUM_SQUARE) + { + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res += (*(input_ptr + x)) * (*(input_ptr + x)); + } + } + else + { + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res += *(input_ptr + x); + } + } + + if (op == ReductionOperation::MEAN_SUM) + { + res /= input_dim_0; + } + + *(reinterpret_cast<T *>(output.ptr())) = res; + break; + } + case ReductionOperation::PROD: + { + auto carry_res = + wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + T res = 1; + for (int i = 0; i < S / 2; ++i) + { + res *= wrapper::vgetlane(carry_res, i); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res *= *(input_ptr + x); + } + + *(reinterpret_cast<T *>(output.ptr())) = res; + break; + } + case ReductionOperation::ARG_IDX_MIN: + { + auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op); + auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + if (*(input_ptr + x) < res) + { + idx = x; + res = *(input_ptr + x); + } + } + *(reinterpret_cast<uint32_t *>(output.ptr())) = idx; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op); + auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + if (*(input_ptr + x) > res) + { + idx = x; + res = *(input_ptr + x); + } + } + *(reinterpret_cast<uint32_t *>(output.ptr())) = idx; + break; + } + case ReductionOperation::MIN: + { + auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res = *(input_ptr + x) < res ? *(input_ptr + x) : res; + } + *(reinterpret_cast<T *>(output.ptr())) = res; + break; + } + case ReductionOperation::MAX: + { + auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res = *(input_ptr + x) > res ? *(input_ptr + x) : res; + } + *(reinterpret_cast<T *>(output.ptr())) = res; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + }, + input, output); + } +}; + +template <typename T> +struct RedOpX_quantized +{ + inline void operator()( + const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op) + { + using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type; + + const auto oq_info = out->info()->quantization_info().uniform(); + + const TensorInfo in_info = *(in->info()); + const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform(); + + const int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast<int>(in_window.x().start()); + const auto window_end_x = static_cast<int>(in_window.x().end()); + + Window in_win_no_pad = in_window; + in_win_no_pad.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, in_win_no_pad); + Iterator output(out, out_window); + + const auto in_offset = static_cast<float>(iq_info.offset); + const float in_scale = iq_info.scale; + + const auto out_offset = static_cast<float>(oq_info.offset); + const float out_scale = oq_info.scale; + + const auto num_elements = static_cast<float>(in_info.dimension(0)); + + const float A = in_scale / (out_scale * num_elements); + const float B = out_offset - (in_scale * in_offset) / (out_scale); + + execute_window_loop( + in_win_no_pad, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<T *>(input.ptr()); + + auto vec_res_value1 = + wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{}); + auto vec_res_value2 = + wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{}); + auto vec_res_value3 = + wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{}); + auto vec_res_value4 = + wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{}); + + auto vec_res_value1_f = vdupq_n_f32(static_cast<float>(1.f)); + auto vec_res_value2_f = vdupq_n_f32(static_cast<float>(1.f)); + auto vec_res_value3_f = vdupq_n_f32(static_cast<float>(1.f)); + auto vec_res_value4_f = vdupq_n_f32(static_cast<float>(1.f)); + + typename wrapper::traits::neon_vector<T, 16>::type vec_res_value = {0}; + + if (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || + op == ReductionOperation::MIN || op == ReductionOperation::MAX) + { + vec_res_value = wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{}); + } + + uint32x4x4_t vec_res_idx{{0}}; + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vec_elements = wrapper::vloadq(input_ptr + x); + switch (op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + { + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); + + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); + vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); + vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); + vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); + break; + } + case ReductionOperation::PROD: + { + const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset); + const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale); + + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); + + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1); + auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2); + auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3); + auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4); + + //de-quantize vec_elements + temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4); + temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4); + temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4); + temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4); + + vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f); + vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f); + vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f); + vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f); + break; + } + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>( + x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>( + x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + + switch (op) + { + case ReductionOperation::ARG_IDX_MIN: + { + auto idx = + calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op); + auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + if (*(input_ptr + x) < res) + { + idx = x; + res = *(input_ptr + x); + } + } + *(reinterpret_cast<uint32_t *>(output.ptr())) = idx; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto idx = + calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op); + auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + if (*(input_ptr + x) > res) + { + idx = x; + res = *(input_ptr + x); + } + } + *(reinterpret_cast<uint32_t *>(output.ptr())) = idx; + break; + } + case ReductionOperation::MIN: + { + auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res = *(input_ptr + x) < res ? *(input_ptr + x) : res; + } + *(reinterpret_cast<T *>(output.ptr())) = res; + break; + } + case ReductionOperation::MAX: + { + auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res = *(input_ptr + x) > res ? *(input_ptr + x) : res; + } + *(reinterpret_cast<T *>(output.ptr())) = res; + break; + } + case ReductionOperation::PROD: + { + auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f); + carry_res = wrapper::vmul(carry_res, vec_res_value3_f); + carry_res = wrapper::vmul(carry_res, vec_res_value4_f); + + float res = wrapper::vgetlane(carry_res, 0); + res *= wrapper::vgetlane(carry_res, 1); + res *= wrapper::vgetlane(carry_res, 2); + res *= wrapper::vgetlane(carry_res, 3); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + //de-quantize input + if (std::is_same<T, uint8_t>::value) + { + res *= dequantize_qasymm8(*(input_ptr + x), iq_info); + } + else + { + res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info); + } + } + + //re-quantize result + if (std::is_same<T, uint8_t>::value) + { + res = quantize_qasymm8(res, iq_info); + } + else + { + res = quantize_qasymm8_signed(res, iq_info); + } + + *reinterpret_cast<T *>(output.ptr()) = static_cast<T>(res); + break; + } + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + { + auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2); + carry_res = wrapper::vadd(carry_res, vec_res_value3); + carry_res = wrapper::vadd(carry_res, vec_res_value4); + + auto carry_paddition = + wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res)); + carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition); + auto res = static_cast<int32_t>(wrapper::vgetlane(carry_paddition, 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res += *(input_ptr + x); + } + + if (op == ReductionOperation::MEAN_SUM) + { + const int32_t resFinal = A * (static_cast<float>(res)) + B; + + *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(resFinal); + } + else + { + // Subtract accumulated offsets + res -= (in_info.dimension(0) - 1) * iq_info.offset; + *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(res); + } + + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + }, + input, output); + } +}; + +template <typename T, int S> +struct RedOpYZW +{ + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; + using neon_vector = typename wrapper::traits::neon_vector<T, S>::type; + + inline void operator()(const Window &in_window, + Window &out_window, + const ITensor *in, + ITensor *out, + int axis, + const ReductionOperation op) + { + const TensorInfo in_info = *(in->info()); + const int window_step_x = 16 / sizeof(T); + const auto window_start_x_tmp = static_cast<int>(in_window.x().start()); + const auto window_end_x_tmp = static_cast<int>(in_window.x().end()); + // As it split over x-axis, need to set the correct spiltted window start and end. + const auto window_start_x = static_cast<int>(0); + const auto window_end_x = static_cast<int>(in_window.shape().x()); + + Window in_win_no_pad = in_window; + in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x())); + Window out_win_no_pad = out_window; + out_win_no_pad.set(Window::DimX, + Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); + + Iterator input(in, in_win_no_pad); + Iterator output(out, out_win_no_pad); + + execute_window_loop( + in_win_no_pad, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<T *>(input.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + neon_vector vec_res_value = {0}; + switch (op) + { + case ReductionOperation::ARG_IDX_MAX: + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::MIN: + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vloadq(input_ptr + x); + break; + } + case ReductionOperation::PROD: + { + vec_res_value = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{}); + break; + } + default: + { + vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + break; + } + } + uint32x4x4_t vec_res_idx{{0}}; + + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + const T *in_ptr = + reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim); + const auto vec_elements = wrapper::vloadq(in_ptr); + switch (op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + vec_res_value = wrapper::vadd(vec_elements, vec_res_value); + break; + case ReductionOperation::SUM_SQUARE: + vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); + break; + case ReductionOperation::PROD: + vec_res_value = wrapper::vmul(vec_elements, vec_res_value); + break; + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = + calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = + calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + + if (op == ReductionOperation::MEAN_SUM) + { + auto vec_width_inv = + wrapper::vinv(wrapper::vdup_n(static_cast<T>(in_info.dimension(axis)), ExactTagType{})); + vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv); + } + + if (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) + { + wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x, vec_res_idx.val[0]); +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + if (std::is_same<T, float16_t>::value) + { + wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x + 4, vec_res_idx.val[1]); + } +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + } + else + { + wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x * sizeof(T)), vec_res_value); + } + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + auto res_value = 0.f; + switch (op) + { + case ReductionOperation::ARG_IDX_MAX: + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::MIN: + case ReductionOperation::MAX: + { + res_value = *(input_ptr + x); + break; + } + case ReductionOperation::PROD: + { + res_value = static_cast<T>(1.f); + break; + } + default: + { + res_value = static_cast<T>(0.f); + break; + } + } + + uint32_t res_idx = 0; + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + const T *in_ptr = + reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim); + + switch (op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + res_value += *in_ptr; + break; + case ReductionOperation::SUM_SQUARE: + res_value += *in_ptr * *in_ptr; + break; + case ReductionOperation::PROD: + res_value *= *in_ptr; + break; + case ReductionOperation::ARG_IDX_MIN: + { + if (*in_ptr < res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + if (*in_ptr > res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::MIN: + { + res_value = *in_ptr < res_value ? *in_ptr : res_value; + break; + } + case ReductionOperation::MAX: + { + res_value = *in_ptr > res_value ? *in_ptr : res_value; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + + if (op == ReductionOperation::MEAN_SUM) + { + res_value /= in_info.dimension(axis); + } + + if (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) + { + *(reinterpret_cast<uint32_t *>(output.ptr()) + x) = res_idx; + } + else + { + *(reinterpret_cast<T *>(output.ptr() + x * sizeof(T))) = res_value; + } + } + }, + input, output); + } +}; + +template <typename T, int S, int axis, ReductionOperation op> +struct RedOpYZW_complex +{ + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; + using neon_vector = typename wrapper::traits::neon_vector<T, S>::type; + + inline void operator()( + const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int, const ReductionOperation) + { + ARM_COMPUTE_ERROR_ON(axis != 2); + ARM_COMPUTE_ERROR_ON(op != ReductionOperation::SUM); + + const TensorInfo in_info = *(in->info()); + const size_t stride_z = in_info.strides_in_bytes()[axis]; + const int window_step_x = 16 / sizeof(T); + const auto window_start_x_tmp = static_cast<int>(in_window.x().start()); + const auto window_end_x_tmp = static_cast<int>(in_window.x().end()); + // As it split over x-axis, need to set the correct spiltted window start and end. + const auto window_start_x = static_cast<int>(0); + const auto window_end_x = static_cast<int>(in_window.shape().x()); + + Window in_win_no_pad = in_window; + in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x())); + Window out_win_no_pad = out_window; + out_win_no_pad.set(Window::DimX, + Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); + + Iterator input(in, in_win_no_pad); + Iterator output(out, out_win_no_pad); + + execute_window_loop( + in_win_no_pad, + [&](const Coordinates &) + { + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + neon_vector vec_res_value_0 = {0}; + neon_vector vec_res_value_1 = {0}; + + vec_res_value_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + vec_res_value_1 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + + T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T)); + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + T *in_ptr_0 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim); + T *in_ptr_1 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim); + + const auto vec_elements_0 = wrapper::vloadq(in_ptr_0); + const auto vec_elements_1 = wrapper::vloadq(in_ptr_1); + + vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0); + vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1); + } + + wrapper::vstore(out_ptr, vec_res_value_0); + wrapper::vstore(out_ptr + 4, vec_res_value_1); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + auto res_value_0 = 0.f; + auto res_value_1 = 0.f; + + T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T)); + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + T *in_ptr = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim); + res_value_0 += *in_ptr; + res_value_1 += *(in_ptr + 1); + } + *out_ptr = res_value_0; + *(out_ptr + 1) = res_value_1; + } + }, + input, output); + } +}; + +template <typename T> +struct RedOpYZW_quantized +{ + inline void operator()(const Window &in_window, + Window &out_window, + const ITensor *in, + ITensor *out, + int axis, + const ReductionOperation op) + { + const TensorInfo in_info = *(in->info()); + const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform(); + using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type; + + const auto oq_info = out->info()->quantization_info().uniform(); + + const int window_step_x = 16 / sizeof(T); + const auto window_start_x_tmp = static_cast<int>(in_window.x().start()); + const auto window_end_x_tmp = static_cast<int>(in_window.x().end()); + // As it split over x-axis, need to set the correct spiltted window start and end. + const auto window_start_x = static_cast<int>(0); + const auto window_end_x = static_cast<int>(in_window.shape().x()); + + Window in_win_no_pad = in_window; + in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x())); + Window out_win_no_pad = out_window; + out_win_no_pad.set(Window::DimX, + Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); + + Iterator input(in, in_win_no_pad); + Iterator output(out, out_win_no_pad); + + using vector_type = + typename wrapper::traits::neon_bitvector<PromotedType, wrapper::traits::BitWidth::W128>::type; + using vector_type_f = typename wrapper::traits::neon_vector<float, 4>::type; + + vector_type vec_res_value1{}; + vector_type vec_res_value2{}; + vector_type vec_res_value3{}; + vector_type vec_res_value4{}; + + vector_type_f vec_res_value1_f{}; + vector_type_f vec_res_value2_f{}; + vector_type_f vec_res_value3_f{}; + vector_type_f vec_res_value4_f{}; + + const float in_offset = static_cast<float>(iq_info.offset); + const float in_scale = iq_info.scale; + + const float out_offset = static_cast<float>(oq_info.offset); + const float out_scale = oq_info.scale; + + const float num_elements = static_cast<float>(in_info.dimension(axis)); + + const float A = in_scale / (out_scale * num_elements); + const float B = out_offset - (in_scale * in_offset) / (out_scale); + + const auto vec_A = wrapper::vdup_n(static_cast<float>(A), wrapper::traits::vector_128_tag{}); + const auto vec_B = wrapper::vdup_n(static_cast<float>(B), wrapper::traits::vector_128_tag{}); + + execute_window_loop( + in_win_no_pad, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<T *>(input.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + uint32x4x4_t vec_res_idx{{0}}; + vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{}); + vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{}); + vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{}); + vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{}); + + vec_res_value1_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{}); + vec_res_value2_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{}); + vec_res_value3_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{}); + vec_res_value4_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{}); + + auto vec_res_value = wrapper::vloadq(input_ptr + x); + + for (unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim) + { + const T *in_ptr = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim; + const auto vec_elements = wrapper::vloadq(in_ptr); + switch (op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + { + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); + + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); + vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); + vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); + vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); + break; + } + case ReductionOperation::PROD: + { + const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset), + wrapper::traits::vector_128_tag{}); + const auto scale32x4f_4 = + wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{}); + + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); + + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1); + auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2); + auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3); + auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4); + + //de-quantize vec_elements + temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), scale32x4f_4); + temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4); + temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4); + temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4); + + vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f); + vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f); + vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f); + vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f); + break; + } + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, + vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, + vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + + switch (op) + { + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::ARG_IDX_MAX: + { + wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x), vec_res_idx.val[0]); + wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]); + wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]); + wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 12, + vec_res_idx.val[3]); + break; + } + case ReductionOperation::MIN: + case ReductionOperation::MAX: + { + wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), vec_res_value); + break; + } + case ReductionOperation::SUM: + { + // Subtract offsets + auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset); + + auto vec_res_s_value1 = wrapper::vreinterpret(vec_res_value1); + auto vec_res_s_value2 = wrapper::vreinterpret(vec_res_value2); + auto vec_res_s_value3 = wrapper::vreinterpret(vec_res_value3); + auto vec_res_s_value4 = wrapper::vreinterpret(vec_res_value4); + + vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets); + vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, offsets); + vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets); + vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets); + + const auto temp16x8t_1 = + wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2)); + const auto temp16x8t_2 = + wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4)); + + combine_and_store<T>(temp16x8t_1, temp16x8t_2, output, x); + break; + } + case ReductionOperation::MEAN_SUM: + { + vec_res_value1_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value1), vec_A); + vec_res_value2_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value2), vec_A); + vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value3), vec_A); + vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value4), vec_A); + +#ifdef __aarch64__ + vec_res_value1 = wrapper::vcvta<PromotedType>(vec_res_value1_f); + vec_res_value2 = wrapper::vcvta<PromotedType>(vec_res_value2_f); + vec_res_value3 = wrapper::vcvta<PromotedType>(vec_res_value3_f); + vec_res_value4 = wrapper::vcvta<PromotedType>(vec_res_value4_f); +#else // defined(__aarch64__) + vec_res_value1 = wrapper::vcvt<PromotedType>(vec_res_value1_f); + vec_res_value2 = wrapper::vcvt<PromotedType>(vec_res_value2_f); + vec_res_value3 = wrapper::vcvt<PromotedType>(vec_res_value3_f); + vec_res_value4 = wrapper::vcvt<PromotedType>(vec_res_value4_f); +#endif // __aarch64__ + + const auto temp16x8t_1 = + wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2)); + const auto temp16x8t_2 = + wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4)); + auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); + + wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res); + break; + } + case ReductionOperation::PROD: + { + const auto offset32x4f_4 = + wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{}); + const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale)); + + //re-quantize + vec_res_value1_f = + wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4); + vec_res_value2_f = + wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4); + vec_res_value3_f = + wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4); + vec_res_value4_f = + wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4); + + vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f); + vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f); + vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f); + vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f); + + const auto temp16x8t_1 = + wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2)); + const auto temp16x8t_2 = + wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4)); + auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); + + wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + float res_value = 0.f; + int32_t res_value_q = 0; + + switch (op) + { + case ReductionOperation::ARG_IDX_MAX: + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::MIN: + case ReductionOperation::MAX: + { + res_value = *(input_ptr + x); + break; + } + case ReductionOperation::PROD: + { + res_value = static_cast<T>(1.0f); + break; + } + default: + { + res_value = static_cast<T>(0.0f); + break; + } + } + uint32_t res_idx = 0; + + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + const T *in_ptr = + reinterpret_cast<T *>(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim); + switch (op) + { + case ReductionOperation::SUM: + { + res_value += *in_ptr; + break; + } + case ReductionOperation::MEAN_SUM: + { + res_value_q += *in_ptr; + break; + } + case ReductionOperation::SUM_SQUARE: + { + res_value += *in_ptr * *in_ptr; + break; + } + case ReductionOperation::PROD: + { + //de-quantize input + if (std::is_same<T, uint8_t>::value) + { + res_value *= dequantize_qasymm8(*in_ptr, iq_info); + } + else + { + res_value *= dequantize_qasymm8_signed(*in_ptr, iq_info); + } + break; + } + case ReductionOperation::ARG_IDX_MIN: + { + if (*in_ptr < res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + if (*in_ptr > res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::MIN: + { + res_value = *in_ptr < res_value ? *in_ptr : res_value; + break; + } + case ReductionOperation::MAX: + { + res_value = *in_ptr > res_value ? *in_ptr : res_value; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + + switch (op) + { + case ReductionOperation::MEAN_SUM: + { + // Apply previously calculated coefficients (with rounding on aarch64) +#ifdef __aarch64__ + const int32_t res = + arm_compute::support::cpp11::round(A * (static_cast<float>(res_value_q)) + B); +#else // defined(__aarch64__) + const int32_t res = A * (static_cast<float>(res_value_q)) + B; +#endif // __aarch64__ + *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res); + break; + } + case ReductionOperation::SUM: + { + // Subtract accumulated offsets + res_value -= (in_info.dimension(axis) - 1) * iq_info.offset; + *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res_value); + break; + } + case ReductionOperation::PROD: + { + //re-quantize result + T res = 0; + if (std::is_same<T, uint8_t>::value) + { + res = quantize_qasymm8(res_value, iq_info); + } + else + { + res = quantize_qasymm8_signed(res_value, iq_info); + } + *(reinterpret_cast<T *>(output.ptr() + x)) = res; + break; + } + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::ARG_IDX_MAX: + { + *(reinterpret_cast<uint32_t *>(output.ptr() + x * 4)) = res_idx; + break; + } + default: + *(reinterpret_cast<T *>(output.ptr() + x)) = res_value; + } + } + }, + input, output); + } +}; + +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_REDUCTION_LAYER_GENERIC_NEON_IMPL_H diff --git a/src/cpu/kernels/reduction_layer/generic/neon/integer.cpp b/src/cpu/kernels/reduction_layer/generic/neon/integer.cpp new file mode 100644 index 0000000000..ad66b456ac --- /dev/null +++ b/src/cpu/kernels/reduction_layer/generic/neon/integer.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/reduction_layer/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void reduce_RedOpX_reduceX_S32_4(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpX<int32_t, 4>>::reduceX(window, input, output, RedOpX<int32_t, 4>(), op); +} + +void reduce_RedOpYZW_reduceY_S32_4(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW<int32_t, 4>>::reduceY(window, input, output, RedOpYZW<int32_t, 4>(), op); +} +void reduce_RedOpYZW_reduceZ_S32_4(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW<int32_t, 4>>::reduceZ(window, input, output, RedOpYZW<int32_t, 4>(), op); +} + +void reduce_RedOpYZW_reduceW_S32_4(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW<int32_t, 4>>::reduceW(window, input, output, RedOpYZW<int32_t, 4>(), op); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/reduction_layer/generic/neon/list.h b/src/cpu/kernels/reduction_layer/generic/neon/list.h new file mode 100644 index 0000000000..947c28a130 --- /dev/null +++ b/src/cpu/kernels/reduction_layer/generic/neon/list.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_REDUCTION_LAYER_GENERIC_NEON_LIST_H +#define ACL_SRC_CPU_KERNELS_REDUCTION_LAYER_GENERIC_NEON_LIST_H + +#include "arm_compute/core/Helpers.h" + +namespace arm_compute +{ +namespace cpu +{ + +#define DECLARE_REDUCTION_KERNEL(func_name) \ + void func_name(const Window &window, const ITensor *in, ITensor *out, const ReductionOperation op) + +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_complex_reduceZ_float32_4_2_SUM); +DECLARE_REDUCTION_KERNEL(reduce_RedOpX_reduceX_float32_4); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceY_float32_4); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceZ_float32_4); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceW_float32_4); + +DECLARE_REDUCTION_KERNEL(reduce_RedOpX_reduceX_float16_8); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceY_float16_8); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceZ_float16_8); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceW_float16_8); + +DECLARE_REDUCTION_KERNEL(reduce_RedOpX_reduceX_S32_4); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceY_S32_4); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceZ_S32_4); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceW_S32_4); + +DECLARE_REDUCTION_KERNEL(reduce_RedOpX_reduceX_qasymm8); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceY_qasymm8); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceZ_qasymm8); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceW_qasymm8); + +DECLARE_REDUCTION_KERNEL(reduce_RedOpX_reduceX_qasymm8_signed); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceY_qasymm8_signed); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceZ_qasymm8_signed); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceW_qasymm8_signed); + +#undef DECLARE_REDUCTION_KERNEL +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_REDUCTION_LAYER_GENERIC_NEON_LIST_H diff --git a/src/cpu/kernels/reduction_layer/generic/neon/qasymm8.cpp b/src/cpu/kernels/reduction_layer/generic/neon/qasymm8.cpp new file mode 100644 index 0000000000..bc711c6855 --- /dev/null +++ b/src/cpu/kernels/reduction_layer/generic/neon/qasymm8.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/reduction_layer/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void reduce_RedOpX_reduceX_qasymm8(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpX_quantized<uint8_t>>::reduceX(window, input, output, RedOpX_quantized<uint8_t>(), op); +} + +void reduce_RedOpYZW_reduceY_qasymm8(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW_quantized<uint8_t>>::reduceY(window, input, output, RedOpYZW_quantized<uint8_t>(), op); +} + +void reduce_RedOpYZW_reduceZ_qasymm8(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW_quantized<uint8_t>>::reduceZ(window, input, output, RedOpYZW_quantized<uint8_t>(), op); +} + +void reduce_RedOpYZW_reduceW_qasymm8(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW_quantized<uint8_t>>::reduceW(window, input, output, RedOpYZW_quantized<uint8_t>(), op); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/reduction_layer/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/reduction_layer/generic/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..10ac3d6715 --- /dev/null +++ b/src/cpu/kernels/reduction_layer/generic/neon/qasymm8_signed.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/reduction_layer/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void reduce_RedOpX_reduceX_qasymm8_signed(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpX_quantized<int8_t>>::reduceX(window, input, output, RedOpX_quantized<int8_t>(), op); +} + +void reduce_RedOpYZW_reduceY_qasymm8_signed(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW_quantized<int8_t>>::reduceY(window, input, output, RedOpYZW_quantized<int8_t>(), op); +} + +void reduce_RedOpYZW_reduceZ_qasymm8_signed(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW_quantized<int8_t>>::reduceZ(window, input, output, RedOpYZW_quantized<int8_t>(), op); +} + +void reduce_RedOpYZW_reduceW_qasymm8_signed(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW_quantized<int8_t>>::reduceW(window, input, output, RedOpYZW_quantized<int8_t>(), op); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp index 7d85885654..a4c856bb8f 100644 --- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp +++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp @@ -945,6 +945,7 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected } break; #endif /* __aarch64__ */ + #if defined(ARM_COMPUTE_ENABLE_BF16) case DataType::BFLOAT16: { @@ -963,13 +964,14 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected break; } #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#if defined(ENABLE_FP16_KERNELS) case DataType::F16: ARM_COMPUTE_RETURN_ERROR_ON_MSG( !(arm_gemm::has_opt_gemm<float16_t, float16_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})), "We could not find an optimized kernel for F16 input and F16 output"); break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +#endif /* ENABLE_FP16_KERNELS */ default: ARM_COMPUTE_RETURN_ERROR_ON_MSG(true, "Usupported type. Could not find a kernel"); break; @@ -1102,11 +1104,11 @@ void CpuGemmAssemblyDispatch::configure( } break; #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifdef ENABLE_FP16_KERNELS case DataType::F16: create_arm_gemm<float16_t, float16_t>(_arm_gemm, a, b, c, d, act, info); break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +#endif /* ENABLE_FP16_KERNELS */ default: break; } diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp index d4d6193fce..baffa8cbb2 100644 --- a/src/runtime/OMP/OMPScheduler.cpp +++ b/src/runtime/OMP/OMPScheduler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -32,10 +32,21 @@ namespace arm_compute { +#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__) OMPScheduler::OMPScheduler() // NOLINT - : _num_threads(omp_get_max_threads()) + : _num_threads(cpu_info().get_cpu_num_excluding_little()), + _nonlittle_num_cpus(cpu_info().get_cpu_num_excluding_little()) { } +#else /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/ +OMPScheduler::OMPScheduler() // NOLINT + : _num_threads(omp_get_max_threads()), _nonlittle_num_cpus(cpu_info().get_cpu_num_excluding_little()) +{ +} +#endif /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/ unsigned int OMPScheduler::num_threads() const { @@ -45,7 +56,15 @@ unsigned int OMPScheduler::num_threads() const void OMPScheduler::set_num_threads(unsigned int num_threads) { const unsigned int num_cores = omp_get_max_threads(); - _num_threads = (num_threads == 0) ? num_cores : num_threads; +#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__) + const unsigned int adjusted_num_threads = std::min(_nonlittle_num_cpus, num_threads); + _num_threads = (num_threads == 0) ? num_cores : adjusted_num_threads; +#else /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/ + _num_threads = (num_threads == 0) ? num_cores : num_threads; +#endif /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/ } void OMPScheduler::schedule(ICPPKernel *kernel, const Hints &hints) @@ -99,9 +118,15 @@ void OMPScheduler::run_workloads(std::vector<arm_compute::IScheduler::Workload> } ThreadInfo info; - info.cpu_info = &cpu_info(); + info.cpu_info = &cpu_info(); + +#if !defined(__ANDROID__) + info.num_threads = _num_threads; +#else /* !__ANDROID__ */ info.num_threads = num_threads_to_use; -#pragma omp parallel for firstprivate(info) num_threads(num_threads_to_use) default(shared) proc_bind(close) \ +#endif /* __ANDROID__ */ + +#pragma omp parallel for firstprivate(info) num_threads(info.num_threads) default(shared) proc_bind(close) \ schedule(static, 1) for (unsigned int wid = 0; wid < amount_of_work; ++wid) { diff --git a/tests/validation/NEON/UNIT/RuntimeContext.cpp b/tests/validation/NEON/UNIT/RuntimeContext.cpp index 819811943d..e0d45c639a 100644 --- a/tests/validation/NEON/UNIT/RuntimeContext.cpp +++ b/tests/validation/NEON/UNIT/RuntimeContext.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -48,6 +48,19 @@ namespace validation { TEST_SUITE(NEON) TEST_SUITE(UNIT) +#if defined(ARM_COMPUTE_OPENMP_SCHEDULER) && !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__) +TEST_CASE(CpuCapacity, framework::DatasetMode::ALL) +{ + CPUInfo& ci = arm_compute::Scheduler::get().cpu_info(); + const uint32_t nonlittle_num_cpus = ci.get_cpu_num_excluding_little(); + const uint32_t num_threads = arm_compute::Scheduler::get().num_threads(); + + ARM_COMPUTE_EXPECT(num_threads<=nonlittle_num_cpus , framework::LogLevel::ERRORS); +} +#endif /* defined(ARM_COMPUTE_OPENMP_SCHEDULER) && !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/ + TEST_SUITE(RuntimeContext) TEST_CASE(Scheduler, framework::DatasetMode::ALL) |